From a9257173996ae38d337e9778c1d6b88bb5319910 Mon Sep 17 00:00:00 2001
From: luozixin2
Date: Sun, 18 Jan 2026 05:36:38 +0000
Subject: [PATCH 01/10] feat: support GPTQ Marlin and AWQ Marlin quantization formats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Main changes:
- Add GPTQ Marlin (W4A16) and AWQ Marlin (W4A16) quantization strategies
- Fix loader.py to correctly load gptq_marlin-format weights (supports the
  Marlin-specific repacked qweight and permuted scales)
- Update quantize_model.py to export the gptq_marlin format (symmetric
  quantization + Marlin repack/permute)
- Update linear.py:
  - Add an _offline_quant_bits buffer that records the quantization bit width
  - Add GPTQ runtime shuffle support (gptq_shuffle)
  - Add lazy repack support for GPTQ/AWQ Marlin
    (_maybe_prepare_offline_gptq_marlin/_awq_marlin)
  - Standardize on the vLLM format (int32 packed, fp16 scales)
- Simplify the per-strategy files and remove duplicated code
- Remove the old AllSpark Marlin implementation files
- Add benchmark configs for the GPTQ/AWQ Marlin variants (per-bit versions)
---
 diffulex/engine/model_runner.py               |   8 +-
 diffulex/layer/linear.py                      | 694 ++++++++++++++++--
 diffulex/utils/loader.py                      | 459 +++++++++++-
 diffulex/utils/quantization/quantize_model.py | 403 +++++-----
 diffulex/utils/quantization/registry.py       |  23 +-
 .../utils/quantization/strategies/__init__.py |   6 +-
 .../strategies/linear_awq_marlin_w4a16.py     | 123 ++++
 .../strategies/linear_awq_w4a16.py            | 517 ++-----------
 .../strategies/linear_fp8_w8a16.py            | 433 ++---------
 .../strategies/linear_fp8_w8a8.py             | 506 ++---------
 .../strategies/linear_gptq_marlin_w4a16.py    | 156 ++++
 .../strategies/linear_gptq_w4a16.py           | 571 +++----------
 .../strategies/linear_int4_w4a16.py           | 537 ++----------
 .../strategies/linear_int4_w4a8.py            | 478 +-----------
 .../strategies/linear_int8_w8a16.py           | 539 +-------------
 .../strategies/linear_int8_w8a8.py            | 493 +++----------
 .../strategies/linear_marlin_int8_w8a16.py    | 209 +++---
 diffulex_bench/configs/awq_bf16kv_varlen.yml  |  47 ++
 .../configs/awq_marlin_bf16kv_varlen.yml      |  48 ++
 diffulex_bench/configs/fp8_bf16kv_varlen.yml  |  48 ++
 diffulex_bench/configs/gptq_bf16kv_varlen.yml |  47 ++
 .../configs/gptq_bf16kv_varlen_tp2.yml        |  47 ++
 .../configs/gptq_marlin_bf16kv_varlen.yml     |  48 ++
 .../configs/gptq_marlin_w2_bf16kv_varlen.yml  |  47 ++
 .../configs/gptq_marlin_w4_bf16kv_varlen.yml  |  47 ++
 .../configs/gptq_marlin_w8_bf16kv_varlen.yml  |  47 ++
 .../configs/gptq_w2_bf16kv_varlen.yml         |  47 ++
 .../configs/gptq_w8_bf16kv_varlen.yml         |  47 ++
 diffulex_kernel/__init__.py                   |  60 +-
 .../csrc/marlin/allspark_qgemm_w8a16.cu       | 542 --------------
 .../csrc/marlin/allspark_repack.cu            | 163 ----
 .../csrc/marlin/allspark_utils.cuh            | 247 -------
 .../csrc/marlin/torch_bindings_marlin.cpp     |  25 -
 diffulex_kernel/python/marlin_ops.py          | 128 ----
 docs/GPTQ_AWQ_SUPPORT.md                      | 233 ------
 35 files changed, 2720 insertions(+), 5353 deletions(-)
 create mode 100644 diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py
 create mode 100644 diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py
 create mode 100644 diffulex_bench/configs/awq_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/fp8_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml
 create mode 100644 diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml
 create mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml
 create mode 100644
diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml create mode 100644 diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml create mode 100644 diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml delete mode 100644 diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu delete mode 100644 diffulex_kernel/csrc/marlin/allspark_repack.cu delete mode 100644 diffulex_kernel/csrc/marlin/allspark_utils.cuh delete mode 100644 diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp delete mode 100644 diffulex_kernel/python/marlin_ops.py delete mode 100644 docs/GPTQ_AWQ_SUPPORT.md diff --git a/diffulex/engine/model_runner.py b/diffulex/engine/model_runner.py index aeeb442..c347fb3 100755 --- a/diffulex/engine/model_runner.py +++ b/diffulex/engine/model_runner.py @@ -36,7 +36,13 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event]): # Initialize model, sampler, and kv cache init_method = f"tcp://{config.master_addr}:{config.master_port}" dist.init_process_group("nccl", init_method, world_size=self.world_size, rank=rank, device_id=config.device_ids[rank]) - device_id = (getattr(config, "device_start", 0) or 0) + rank + config.device_ids[rank] + # Choose CUDA device for this TP rank. + # config.device_ids is already a list of logical CUDA device indices (respecting CUDA_VISIBLE_DEVICES). + # Do NOT add rank again, otherwise rank 1 with device_ids=[0,1] becomes device 2. + if getattr(config, "device_ids", None): + device_id = config.device_ids[rank] + else: + device_id = (getattr(config, "device_start", 0) or 0) + rank assert 0 <= device_id < torch.cuda.device_count(), f"Invalid device_id {device_id}." torch.cuda.set_device(device_id) default_dtype = torch.get_default_dtype() diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index b34f017..0ba2ceb 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -89,20 +89,45 @@ def __init__( self.register_buffer("_weight_is_quantized", torch.tensor(False, dtype=torch.bool), persistent=False) # GPTQ/AWQ offline quantized weight storage (W4A16). - # GPTQ: qweight (packed int4), qzeros (packed int4), scales (per-group), g_idx (optional) - # AWQ: qweight (packed int4), qzeros (packed int4), scales (per-group) - self.register_buffer("gptq_qweight", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("gptq_qzeros", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("gptq_scales", torch.empty(0, dtype=torch.float32), persistent=False) + # NOTE(vLLM-format): + # - GPTQ: qweight int32 [K/pack, N], qzeros int32 [K/group, N/pack], + # scales fp16 [K/group, N], g_idx optional (usually empty when desc_act=False) + # - AWQ : qweight int32 [K, N/pack], qzeros int32 [K/group, N/pack], + # scales fp16 [K/group, N] + # + # Where pack = 32 / bits (bits=4 => pack=8), K=in_features, N=out_features. 
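+        # Illustrative shapes (assumed example, not taken from any particular checkpoint):
+        # with K=4096, N=11008, bits=4 (pack=8), group_size=128 (so num_groups=32):
+        #   GPTQ: qweight [512, 11008], qzeros [32, 1376], scales [32, 11008]
+        #   AWQ : qweight [4096, 1376], qzeros [32, 1376], scales [32, 11008]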
+ self.register_buffer("gptq_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_qzeros", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_scales", torch.empty(0, dtype=torch.float16), persistent=False) self.register_buffer("gptq_g_idx", torch.empty(0, dtype=torch.int32), persistent=False) - self.register_buffer("awq_qweight", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("awq_qzeros", torch.empty(0, dtype=torch.int8), persistent=False) - self.register_buffer("awq_scales", torch.empty(0, dtype=torch.float32), persistent=False) + self.register_buffer("awq_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_qzeros", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_scales", torch.empty(0, dtype=torch.float16), persistent=False) # Metadata for offline quantized weights self.register_buffer("_offline_quant_format", torch.empty(0, dtype=torch.int8), persistent=False) # 0=none, 1=gptq, 2=awq + # Bits for offline GPTQ/AWQ weights (needed for marlin-exported layouts where + # we cannot infer bits from packed tensor shapes). + self.register_buffer("_offline_quant_bits", torch.tensor(0, dtype=torch.int32), persistent=False) self.register_buffer("_offline_quant_group_size", torch.tensor(128, dtype=torch.int32), persistent=False) self.register_buffer("_offline_quant_out_features", torch.tensor(0, dtype=torch.int32), persistent=False) self.register_buffer("_offline_quant_in_features", torch.tensor(0, dtype=torch.int32), persistent=False) + # GPTQ runtime prep state (vLLM requires gptq_shuffle before first gemm). + self.register_buffer("_gptq_is_shuffled", torch.tensor(False, dtype=torch.bool), persistent=False) + + # ---- vLLM Marlin variants (GPTQ/AWQ) one-time repack cache ---- + # These buffers are populated lazily when a *_marlin strategy is selected. 
+ self.register_buffer("_gptq_marlin_is_prepared", torch.tensor(False, dtype=torch.bool), persistent=False) + self.register_buffer("gptq_marlin_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_scales", torch.empty(0, dtype=torch.float16), persistent=False) + self.register_buffer("gptq_marlin_zp", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_g_idx", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_g_idx_sort_indices", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("gptq_marlin_workspace", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("_awq_marlin_is_prepared", torch.tensor(False, dtype=torch.bool), persistent=False) + self.register_buffer("awq_marlin_qweight", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_marlin_scales", torch.empty(0, dtype=torch.float16), persistent=False) + self.register_buffer("awq_marlin_zp", torch.empty(0, dtype=torch.int32), persistent=False) + self.register_buffer("awq_marlin_workspace", torch.empty(0, dtype=torch.int32), persistent=False) def has_quantized_weight(self) -> bool: return bool(self._weight_is_quantized.item()) and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 @@ -140,78 +165,434 @@ def set_offline_quantized_weight( Args: format: "gptq" or "awq" - qweight: int8 packed int4 weights [out_features, (in_features + 1) // 2] - qzeros: int8 packed int4 zeros [num_groups, (in_features + 1) // 2] - scales: float32 per-group scales [num_groups, in_features] or [num_groups] + qweight/qzeros/scales: vLLM standard tensors (see notes above). out_features: Output features (N) in_features: Input features (K) group_size: Group size for quantization (default: 128) - g_idx: Optional int32 tensor [out_features] for GPTQ group indices (GPTQ only) + g_idx: Optional int32 tensor [in_features] for act-order (GPTQ only; usually empty) """ + # NOTE: Offline quantized weights are typically loaded from safetensors on CPU. + # In Diffulex, the engine may move modules to CUDA before calling this method, + # so we must ensure tensors are moved to the module device here. + def _infer_module_device() -> torch.device: + w = getattr(self, "weight", None) + if isinstance(w, torch.Tensor): + return w.device + for p in self.parameters(recurse=False): + return p.device + for b in self.buffers(recurse=False): + return b.device + return torch.device("cpu") + + module_device = _infer_module_device() + format = format.strip().lower() if format not in ("gptq", "awq"): raise ValueError(f"Unsupported offline quant format: {format}. Supported: 'gptq', 'awq'") - if qweight.dtype != torch.int8: - raise TypeError(f"qweight must be int8, got {qweight.dtype}") - if qzeros.dtype != torch.int8: - raise TypeError(f"qzeros must be int8, got {qzeros.dtype}") - if scales.dtype != torch.float32: - scales = scales.to(dtype=torch.float32) + # Infer bits/pack_factor from packed tensor shapes to support GPTQ W2/W4/W8. + # vLLM packing convention: + # - GPTQ: qweight [K/pack, N], qzeros [K/group, N/pack] + # - AWQ: qweight [K, N/pack], qzeros [K/group, N/pack] + # where pack = 32 / bits and bits must divide 32. 
+ if format == "gptq": + if int(qweight.shape[0]) <= 0 or in_features % int(qweight.shape[0]) != 0: + raise ValueError( + "Cannot infer GPTQ pack_factor from qweight shape: " + f"in_features={in_features}, qweight.shape={tuple(qweight.shape)}" + ) + pack_factor = in_features // int(qweight.shape[0]) + else: # awq + if int(qweight.shape[1]) <= 0 or out_features % int(qweight.shape[1]) != 0: + raise ValueError( + "Cannot infer AWQ pack_factor from qweight shape: " + f"out_features={out_features}, qweight.shape={tuple(qweight.shape)}" + ) + pack_factor = out_features // int(qweight.shape[1]) + if 32 % pack_factor != 0: + raise ValueError( + f"Unsupported pack_factor={pack_factor} (requires 32%pack_factor==0) " + f"for offline format={format}. " + f"in_features={in_features}, out_features={out_features}, " + f"qweight.shape={tuple(qweight.shape)}, qzeros.shape={tuple(qzeros.shape)}, scales.shape={tuple(scales.shape)}" + ) + bits = 32 // pack_factor + if format == "awq" and bits != 4: + raise ValueError(f"AWQ 目前仅支持 4-bit(pack_factor=8),当前推断 bits={bits} (pack_factor={pack_factor})") + # Record bits for downstream kernels (esp. marlin path). + self._offline_quant_bits = torch.tensor(bits, dtype=torch.int32, device=module_device) + + if qweight.dtype != torch.int32: + raise TypeError(f"qweight must be int32 (vLLM format), got {qweight.dtype}") + if qzeros.dtype != torch.int32: + raise TypeError(f"qzeros must be int32 (vLLM format), got {qzeros.dtype}") + if scales.dtype not in (torch.float16, torch.bfloat16, torch.float32): + raise TypeError( + f"scales must be float16/bfloat16/float32 (vLLM format), got {scales.dtype}" + ) + if scales.dtype != torch.float16: + scales = scales.to(dtype=torch.float16) + + # Move to module device before validation/assignment. + if qweight.device != module_device: + qweight = qweight.to(device=module_device) + if qzeros.device != module_device: + qzeros = qzeros.to(device=module_device) + if scales.device != module_device: + scales = scales.to(device=module_device) + if g_idx is not None and g_idx.device != module_device: + g_idx = g_idx.to(device=module_device) - num_groups = (out_features + group_size - 1) // group_size - expected_qweight_shape = (out_features, (in_features + 1) // 2) - expected_qzeros_shape = (num_groups, (in_features + 1) // 2) + # group_size == -1 means channelwise in some ecosystems; vLLM normalizes -1 to K. + group_size_norm = in_features if group_size == -1 else group_size + if group_size_norm <= 0 or (in_features % group_size_norm != 0): + raise ValueError( + f"Invalid group_size={group_size} for in_features={in_features}. " + "Expected group_size == -1 or a positive divisor of in_features." 
+ ) + num_groups = in_features // group_size_norm + + if format == "gptq": + expected_qweight_shape = (in_features // pack_factor, out_features) + expected_qzeros_shape = (num_groups, out_features // pack_factor) + expected_scales_shape = (num_groups, out_features) + else: # awq + expected_qweight_shape = (in_features, out_features // pack_factor) + expected_qzeros_shape = (num_groups, out_features // pack_factor) + expected_scales_shape = (num_groups, out_features) if qweight.shape != expected_qweight_shape: raise ValueError( - f"qweight shape mismatch: got {qweight.shape}, expected {expected_qweight_shape}" + f"qweight shape mismatch: got {tuple(qweight.shape)}, expected {expected_qweight_shape}" ) if qzeros.shape != expected_qzeros_shape: raise ValueError( - f"qzeros shape mismatch: got {qzeros.shape}, expected {expected_qzeros_shape}" + f"qzeros shape mismatch: got {tuple(qzeros.shape)}, expected {expected_qzeros_shape}" + ) + if scales.shape != expected_scales_shape: + raise ValueError( + f"scales shape mismatch: got {tuple(scales.shape)}, expected {expected_scales_shape}" ) if format == "gptq": self.gptq_qweight = qweight self.gptq_qzeros = qzeros self.gptq_scales = scales + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() == 0: + g_idx = None if g_idx is not None: - if g_idx.shape != (out_features,): + if g_idx.shape != (in_features,): raise ValueError( - f"g_idx shape mismatch: got {g_idx.shape}, expected ({out_features},)" + f"g_idx shape mismatch: got {g_idx.shape}, expected ({in_features},)" ) if g_idx.dtype != torch.int32: g_idx = g_idx.to(dtype=torch.int32) self.gptq_g_idx = g_idx else: # Clear g_idx if not provided - self.gptq_g_idx = torch.empty(0, dtype=torch.int32) - self._offline_quant_format = torch.tensor(1, dtype=torch.int8) + self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + self._offline_quant_format = torch.tensor(1, dtype=torch.int8, device=module_device) + self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) else: # AWQ self.awq_qweight = qweight self.awq_qzeros = qzeros self.awq_scales = scales # AWQ doesn't use g_idx, clear it - self.gptq_qweight = torch.empty(0, dtype=torch.int8) - self.gptq_qzeros = torch.empty(0, dtype=torch.int8) - self.gptq_scales = torch.empty(0, dtype=torch.float32) - self.gptq_g_idx = torch.empty(0, dtype=torch.int32) - self._offline_quant_format = torch.tensor(2, dtype=torch.int8) + self.gptq_qweight = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_qzeros = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_scales = torch.empty(0, dtype=torch.float16, device=module_device) + self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + self._offline_quant_format = torch.tensor(2, dtype=torch.int8, device=module_device) + self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + + # Reset marlin-prep caches (weights may have changed / moved). 
+ self._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) + self.gptq_marlin_qweight = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_scales = torch.empty(0, dtype=torch.float16, device=module_device) + self.gptq_marlin_zp = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_g_idx_sort_indices = torch.empty(0, dtype=torch.int32, device=module_device) + self.gptq_marlin_workspace = torch.empty(0, dtype=torch.int32, device=module_device) + self._awq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) + self.awq_marlin_qweight = torch.empty(0, dtype=torch.int32, device=module_device) + self.awq_marlin_scales = torch.empty(0, dtype=torch.float16, device=module_device) + self.awq_marlin_zp = torch.empty(0, dtype=torch.int32, device=module_device) + self.awq_marlin_workspace = torch.empty(0, dtype=torch.int32, device=module_device) - self._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32) - self._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32) - self._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32) + self._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32, device=module_device) + self._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) + self._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) # Drop bf16 weight Parameter if present (to free memory) if "weight" in self._parameters: self._parameters.pop("weight", None) setattr(self, "weight", None) + def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: + """Prepare vLLM GPTQ weights on first use (required gptq_shuffle).""" + if self._offline_quant_format.numel() == 0: + return + if int(self._offline_quant_format.item()) != 1: + return + if self.gptq_qweight.numel() == 0: + return + if self._gptq_is_shuffled.numel() > 0 and bool(self._gptq_is_shuffled.item()): + return + + # Lazy import to avoid pulling vLLM unless GPTQ offline weights are used. + try: + from vllm import _custom_ops as ops # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "GPTQ offline 权重已加载,但无法导入 vLLM CUDA custom ops(vllm._custom_ops)。" + ) from e + + # vLLM uses torch.int for g_idx (can be empty when desc_act=False). + if self.gptq_g_idx.numel() == 0: + g_idx = torch.empty((0,), device=x.device, dtype=torch.int) + else: + g_idx = self.gptq_g_idx.to(device=x.device, dtype=torch.int) + + if self.gptq_qweight.device != x.device: + raise RuntimeError( + f"GPTQ qweight device mismatch: qweight on {self.gptq_qweight.device}, x on {x.device}. " + "请确保模型与输入在同一设备。" + ) + + # Infer weight_bits from packed qweight shape to support GPTQ W2/W4/W8. + # qweight: [K/pack_factor, N], where pack_factor = 32 / weight_bits. 
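+        # Accepted mappings (32 % pack_factor == 0): pack_factor 16 -> W2, 8 -> W4, 4 -> W8.
+        # E.g. (hypothetical) in_features=4096 with qweight.shape[0] == 1024 gives
+        # pack_factor = 4 and therefore weight_bits = 8.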
+ in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else None + if in_features is None or in_features <= 0: + raise RuntimeError("GPTQ offline 权重已加载,但无法推断 in_features 以计算 weight_bits。") + if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: + raise RuntimeError( + f"GPTQ qweight shape 不合法,无法推断 weight_bits: " + f"in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + if 32 % pack_factor != 0: + raise RuntimeError( + f"GPTQ pack_factor={pack_factor} 不支持(需要 32 % pack_factor == 0)," + f"in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + weight_bits = 32 // pack_factor + ops.gptq_shuffle(self.gptq_qweight, g_idx, weight_bits) + self._gptq_is_shuffled = torch.tensor(True, dtype=torch.bool, device=x.device) + + def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: + """Prepare vLLM GPTQ Marlin weights on first use (repack + permute scales/zp). + + IMPORTANT: This path must NOT call `gptq_shuffle` (that is specific to gptq_gemm/exllama). + """ + if self._offline_quant_format.numel() == 0: + return + if int(self._offline_quant_format.item()) != 1: + return + if self.gptq_qweight.numel() == 0: + return + if self._gptq_marlin_is_prepared.numel() > 0 and bool(self._gptq_marlin_is_prepared.item()): + return + + try: + from vllm import _custom_ops as ops # type: ignore + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_make_empty_g_idx, + marlin_make_workspace_new, + marlin_permute_scales, + marlin_sort_g_idx, + marlin_zero_points, + unpack_cols, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "GPTQ Marlin 需要 vLLM CUDA custom ops + marlin_utils,但当前环境不可用。" + ) from e + + device = x.device + if self.gptq_qweight.device != device: + raise RuntimeError( + f"GPTQ qweight device mismatch: qweight on {self.gptq_qweight.device}, x on {device}. " + "请确保模型与输入在同一设备。" + ) + + in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 + out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 + group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + if in_features <= 0 or out_features <= 0: + raise RuntimeError( + f"GPTQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" + ) + + # Determine weight_bits. + # - Standard GPTQ layout: infer from qweight K packing. + # - Marlin-exported layout: bits cannot be inferred from qweight shape; use recorded bits. 
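+        # (Per the gptq_marlin handling in loader.py, a Marlin-repacked qweight is roughly
+        #  [K/16, N*bits/2], so its K dimension no longer encodes pack_factor = 32/bits;
+        #  the bits recorded in _offline_quant_bits at load time are authoritative here.)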
+ weight_bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if weight_bits <= 0: + if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: + raise RuntimeError( + "GPTQ Marlin: cannot infer pack_factor from qweight shape: " + f"in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + if 32 % pack_factor != 0: + raise RuntimeError( + f"GPTQ Marlin: unsupported pack_factor={pack_factor} (requires 32%pack_factor==0)" + ) + weight_bits = 32 // pack_factor + if weight_bits not in (4, 8): + raise RuntimeError( + f"GPTQ Marlin: only 4/8-bit are supported in this integration, got bits={weight_bits}" + ) + + # If loader already provided marlin-ready weights/scales (exported offline), + # skip repack/permute but still create workspace / g_idx metadata. + already_marlin_ready = ( + self.gptq_marlin_qweight.numel() > 0 + and self.gptq_marlin_scales.numel() > 0 + ) + if already_marlin_ready: + if self.gptq_marlin_qweight.device != device or self.gptq_marlin_scales.device != device: + raise RuntimeError( + "GPTQ Marlin: prepacked marlin tensors device mismatch: " + f"qweight on {self.gptq_marlin_qweight.device}, scales on {self.gptq_marlin_scales.device}, x on {device}." + ) + + # g_idx (act-order) handling: marlin expects sorted g_idx + sort indices; otherwise empty. + if self.gptq_g_idx.numel() > 0: + g_idx_sorted, g_idx_sort_indices = marlin_sort_g_idx(self.gptq_g_idx.to(device=device, dtype=torch.int32)) + self.gptq_marlin_g_idx = g_idx_sorted + self.gptq_marlin_g_idx_sort_indices = g_idx_sort_indices + else: + self.gptq_marlin_g_idx = marlin_make_empty_g_idx(device) + self.gptq_marlin_g_idx_sort_indices = marlin_make_empty_g_idx(device) + + # Workspace (internal locking mechanism). + self.gptq_marlin_workspace = marlin_make_workspace_new(device) + + if not already_marlin_ready: + # Repack qweight to marlin format. + self.gptq_marlin_qweight = ops.gptq_marlin_repack( + self.gptq_qweight.contiguous(), + perm=self.gptq_marlin_g_idx_sort_indices, + size_k=in_features, + size_n=out_features, + num_bits=weight_bits, + is_a_8bit=False, + ) + + # Permute scales to marlin format. + self.gptq_marlin_scales = marlin_permute_scales( + self.gptq_scales.contiguous(), + size_k=in_features, + size_n=out_features, + group_size=group_size, + is_a_8bit=False, + ) + + # GPTQ Marlin only supports symmetric weights (no runtime zero-points). + # Use empty zp to keep has_zp=False in the kernel. 
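+        # (Symmetric GPTQ v1 checkpoints store qzeros as the constant bias pattern 2^(bits-1)-1,
+        #  e.g. 0x77777777 when bits=4, so no per-group zero-point is applied at runtime.)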
+ self.gptq_marlin_zp = marlin_make_empty_g_idx(device) + + self._gptq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + + def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: + """Prepare vLLM AWQ Marlin weights on first use (repack + permute scales/zp).""" + if self._offline_quant_format.numel() == 0: + return + if int(self._offline_quant_format.item()) != 2: + return + if self.awq_qweight.numel() == 0: + return + if self._awq_marlin_is_prepared.numel() > 0 and bool(self._awq_marlin_is_prepared.item()): + return + + try: + from vllm import _custom_ops as ops # type: ignore + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + awq_to_marlin_zero_points, + marlin_make_empty_g_idx, + marlin_make_workspace_new, + marlin_permute_scales, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "AWQ Marlin 需要 vLLM CUDA custom ops + marlin_utils,但当前环境不可用。" + ) from e + + device = x.device + if self.awq_qweight.device != device: + raise RuntimeError( + f"AWQ qweight device mismatch: qweight on {self.awq_qweight.device}, x on {device}. " + "请确保模型与输入在同一设备。" + ) + + in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 + out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 + group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + if in_features <= 0 or out_features <= 0: + raise RuntimeError( + f"AWQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" + ) + + # AWQ is 4-bit only. + pack_factor = out_features // int(self.awq_qweight.shape[1]) + if pack_factor != 8: + raise RuntimeError(f"AWQ Marlin: expected pack_factor=8 (W4), got pack_factor={pack_factor}") + weight_bits = 4 + num_groups = (in_features // (in_features if group_size == -1 else group_size)) + + self.awq_marlin_workspace = marlin_make_workspace_new(device) + + # Repack qweight to marlin format. + self.awq_marlin_qweight = ops.awq_marlin_repack( + self.awq_qweight, + size_k=in_features, + size_n=out_features, + num_bits=weight_bits, + is_a_8bit=False, + ) + + # Permute scales to marlin format. + self.awq_marlin_scales = marlin_permute_scales( + self.awq_scales, + size_k=in_features, + size_n=out_features, + group_size=group_size, + is_a_8bit=False, + ) + + # Convert zero-points to marlin format. + self.awq_marlin_zp = awq_to_marlin_zero_points( + self.awq_qzeros, + size_k=num_groups, + size_n=out_features, + num_bits=weight_bits, + is_a_8bit=False, + ) + + # g_idx not used for AWQ marlin (keep empty, strategy will pass empties). + _ = marlin_make_empty_g_idx # keep import referenced for clarity + self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: torch.Tensor) -> None: - # Support both int8 (for int8/int4 quantization) and uint8 (for FP8 quantization) - if quant_weight_int8.dtype not in (torch.int8, torch.uint8): - raise TypeError(f"quant_weight_int8 must be int8 or uint8, got {quant_weight_int8.dtype}") + # Support: + # - int8: int8/int4 weight-only quantization + # - float8: FP8 weight-only quantization (vLLM-aligned) + # - uint8: legacy FP8 storage (kept for backward compatibility) + fp8_dtypes: tuple[torch.dtype, ...] 
= tuple( + d + for d in ( + getattr(torch, "float8_e4m3fn", None), + getattr(torch, "float8_e4m3fnuz", None), + getattr(torch, "float8_e5m2", None), + getattr(torch, "float8_e5m2fnuz", None), + ) + if d is not None + ) + if quant_weight_int8.dtype not in (torch.int8, torch.uint8, *fp8_dtypes): + raise TypeError( + f"quant_weight_int8 must be int8/uint8/float8, got {quant_weight_int8.dtype}" + ) # Store scales dtype depends on strategy: # - W8A16/W4A16 kernels currently take bf16 scales. # - W8A8/W4A8 paths are more sensitive to scale precision; keep scales at fp16. @@ -237,6 +618,43 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to self.quant_scales = quant_scales self._weight_is_quantized.fill_(True) + def _maybe_promote_weight_to_quantized_at_runtime( + self, + x: torch.Tensor, + strategy, + *, + expected_weight_formats: tuple[str, ...] = ("int8", "int4", "fp8_e4m3", "fp8_e5m2"), + ) -> None: + """Runtime safety net: if a Linear is configured for quantization but the bf16/fp16 + weight Parameter was not quantized+removed at load-time (e.g., due to sharded load + ordering), quantize once on first forward and drop the bf16 weight Parameter. + + This avoids keeping both bf16 weights and quantized weights resident on GPU. + """ + if strategy is None: + return + if self.has_offline_quantized_weight() or self.has_quantized_weight(): + return + weight_param = self._parameters.get("weight", None) + if weight_param is None: + return + weight_format = getattr(strategy, "linear_weight_format", None) + if weight_format not in expected_weight_formats: + return + if getattr(strategy, "name", "").startswith("linear_stub"): + return + w = getattr(self, "weight", None) + if w is None or getattr(w, "dtype", None) not in (torch.bfloat16, torch.float16): + return + try: + qweight, scales = strategy.quantize_weight_for_kernel(w.data, device=w.data.device) + except Exception: + return + self.set_quantized_weight(qweight, scales) + # Drop bf16 weight Parameter to free GPU memory. + self._parameters.pop("weight", None) + setattr(self, "weight", None) + def _maybe_quantize_loaded_weight_param( self, param: nn.Parameter, @@ -322,6 +740,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: strategy = get_linear_strategy(self.quant_kind) + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. + self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) # Check for offline quantized weights (GPTQ/AWQ) first if self.has_offline_quantized_weight(): @@ -331,6 +751,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out_features = int(self._offline_quant_out_features.item()) in_features = int(self._offline_quant_in_features.item()) group_size = int(self._offline_quant_group_size.item()) + weight_format = getattr(strategy, "linear_weight_format", None) kwargs = { "out_features": out_features, @@ -339,21 +760,60 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: } if format_val == 1: # GPTQ - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - if self.gptq_g_idx.numel() > 0: + # IMPORTANT: only gptq_gemm needs gptq_shuffle; marlin variants require the original format. 
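+                # Dispatch summary: weight_format == "gptq"        -> gptq_shuffle once, then gptq_gemm;
+                #                   weight_format == "gptq_marlin" -> lazy repack/permute on first use, no shuffle.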
+ if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + kwargs.update({ + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + }) + # Always pass g_idx (can be empty). vLLM expects it for GPTQ kernels. kwargs["gptq_g_idx"] = self.gptq_g_idx + elif weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + # Expose bits (needed to select scalar_types.* in strategy). + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits <= 0: + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + bits = 32 // pack_factor + kwargs["gptq_weight_bits"] = bits + kwargs.update({ + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + }) + else: + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) elif format_val == 2: # AWQ - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) + if weight_format == "awq": + kwargs.update({ + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + }) + elif weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + kwargs.update({ + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + }) + else: + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) base_out = strategy.linear_forward( x, @@ -427,6 +887,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: strategy = get_linear_strategy(self.quant_kind) + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. 
+ self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) # Check for offline quantized weights (GPTQ/AWQ) first if self.has_offline_quantized_weight(): @@ -436,6 +898,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out_features = int(self._offline_quant_out_features.item()) in_features = int(self._offline_quant_in_features.item()) group_size = int(self._offline_quant_group_size.item()) + weight_format = getattr(strategy, "linear_weight_format", None) kwargs = { "out_features": out_features, @@ -444,21 +907,57 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: } if format_val == 1: # GPTQ - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - if self.gptq_g_idx.numel() > 0: + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + kwargs.update({ + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + }) kwargs["gptq_g_idx"] = self.gptq_g_idx + elif weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits <= 0: + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + bits = 32 // pack_factor + kwargs["gptq_weight_bits"] = bits + kwargs.update({ + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + }) + else: + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) elif format_val == 2: # AWQ - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) + if weight_format == "awq": + kwargs.update({ + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + }) + elif weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + kwargs.update({ + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + }) + else: + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) base_out = strategy.linear_forward( x, @@ -609,6 +1108,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if self.tp_rank == 0 else None strategy = get_linear_strategy(self.quant_kind) + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. 
+ self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) # Check for offline quantized weights (GPTQ/AWQ) first if self.has_offline_quantized_weight(): @@ -618,6 +1119,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out_features = int(self._offline_quant_out_features.item()) in_features = int(self._offline_quant_in_features.item()) group_size = int(self._offline_quant_group_size.item()) + weight_format = getattr(strategy, "linear_weight_format", None) kwargs = { "out_features": out_features, @@ -626,21 +1128,59 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: } if format_val == 1: # GPTQ - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - if self.gptq_g_idx.numel() > 0: + if weight_format == "gptq": + # vLLM requires gptq_shuffle before first gptq_gemm. + self._maybe_prepare_offline_gptq(x) + kwargs.update({ + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + }) + # Always pass g_idx (can be empty); strategy will normalize dtype/device. kwargs["gptq_g_idx"] = self.gptq_g_idx + elif weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits <= 0: + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + bits = 32 // pack_factor + kwargs["gptq_weight_bits"] = bits + kwargs.update({ + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + }) + else: + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) elif format_val == 2: # AWQ - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) + if weight_format == "awq": + kwargs.update({ + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + }) + elif weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + kwargs.update({ + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + }) + else: + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " + "is not compatible." + ) y = strategy.linear_forward( x, diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index 7b2a151..fb608f9 100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -12,6 +12,151 @@ logger = get_logger(__name__) +def _read_quantize_config(model_dir: str) -> dict: + """Read vLLM-style quantization metadata if present. + + We use this to detect checkpoint formats like `gptq_marlin` which reuse the same + tensor keys (qweight/qzeros/scales[/g_idx]) but have different semantics. 
+ """ + cfg_path = os.path.join(model_dir, "quantize_config.json") + if not os.path.exists(cfg_path): + return {} + try: + with open(cfg_path, "r") as f: + data = json.load(f) + return data if isinstance(data, dict) else {} + except Exception: + return {} + + +def _make_packed_qzeros_constant( + *, + num_groups: int, + out_features: int, + bits: int, + device: torch.device | str, +) -> torch.Tensor: + """Create a GPTQ-style packed qzeros tensor filled with a constant. + + For vLLM GPTQ v1 checkpoints, zeros are stored as (zeros - 1) and then bit-packed + along the output dimension (N). For symmetric quantization, zeros is typically + bias=2^(bits-1), thus stored constant becomes (2^(bits-1) - 1). + + This is primarily used as a *shape-compatible dummy* when loading gptq_marlin + checkpoints where runtime zero-points are intentionally unused (qzeros may be empty). + """ + if bits not in (2, 4, 8): + raise ValueError(f"Unsupported bits={bits} for packed qzeros (expected 2/4/8)") + pack_factor = 32 // bits + if out_features % pack_factor != 0: + raise ValueError( + f"out_features={out_features} not divisible by pack_factor={pack_factor} for bits={bits}" + ) + out_packed = out_features // pack_factor + + # Stored constant for GPTQ v1: bias - 1, where bias = 2^(bits-1). + z = (1 << (bits - 1)) - 1 + packed_val = 0 + for i in range(pack_factor): + packed_val |= (z & ((1 << bits) - 1)) << (bits * i) + + return torch.full( + (int(num_groups), int(out_packed)), + int(packed_val), + dtype=torch.int32, + device=device, + ) + + +def _infer_module_device(module: nn.Module) -> torch.device: + w = getattr(module, "weight", None) + if isinstance(w, torch.Tensor): + return w.device + for p in module.parameters(recurse=False): + return p.device + for b in module.buffers(recurse=False): + return b.device + return torch.device("cpu") + + +def _set_offline_gptq_marlin_weight( + module: nn.Module, + *, + qweight: torch.Tensor, + scales: torch.Tensor, + out_features: int, + in_features: int, + group_size: int, + bits: int, + g_idx: torch.Tensor | None, +) -> None: + """Directly set GPTQ-Marlin-ready offline weights into a Diffulex Linear module. + + This bypasses `set_offline_quantized_weight` because marlin-exported `scales` + use a different layout (e.g. (2*num_groups, out_features/2)) and would fail + the standard GPTQ shape validation. + + We still populate minimal GPTQ metadata/buffers so Diffulex forward chooses + the offline path, and then `LinearBase._maybe_prepare_offline_gptq_marlin` + will only allocate workspace / g_idx metadata (and not repack/permute again). + """ + module_device = _infer_module_device(module) + if qweight.device != module_device: + qweight = qweight.to(device=module_device) + if scales.device != module_device: + scales = scales.to(device=module_device) + if g_idx is not None and g_idx.device != module_device: + g_idx = g_idx.to(device=module_device) + + pack_factor = 32 // int(bits) + group_size_norm = in_features if group_size == -1 else group_size + if group_size_norm <= 0 or in_features % group_size_norm != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size_norm + + # Minimal qzeros to satisfy offline presence checks. (Marlin GPTQ symmetric doesn't use runtime zp.) + qzeros = _make_packed_qzeros_constant( + num_groups=num_groups, + out_features=out_features, + bits=int(bits), + device=module_device, + ) + + # Populate GPTQ buffers (note: scales here are marlin layout; gptq kernels should not be used). 
+ module.gptq_qweight = qweight + module.gptq_qzeros = qzeros + module.gptq_scales = scales.to(dtype=torch.float16) + if g_idx is None: + module.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + else: + if getattr(g_idx, "numel", lambda: 1)() == 0: + module.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + else: + module.gptq_g_idx = g_idx.to(dtype=torch.int32) + + # Also mark as marlin-ready so LinearBase won't repack/permute again. + module.gptq_marlin_qweight = qweight + module.gptq_marlin_scales = module.gptq_scales + + module._offline_quant_format = torch.tensor(1, dtype=torch.int8, device=module_device) + module._offline_quant_bits = torch.tensor(int(bits), dtype=torch.int32, device=module_device) + module._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32, device=module_device) + module._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) + module._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) + module._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + + # Reset marlin-prep caches (workspace/zp/g_idx meta will be created on first forward). + module._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) + module.gptq_marlin_zp = torch.empty(0, dtype=torch.int32, device=module_device) + module.gptq_marlin_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) + module.gptq_marlin_g_idx_sort_indices = torch.empty(0, dtype=torch.int32, device=module_device) + module.gptq_marlin_workspace = torch.empty(0, dtype=torch.int32, device=module_device) + + # Drop bf16 weight Parameter if present (to free memory and avoid accidental fallback). + if hasattr(module, "_parameters") and "weight" in module._parameters: + module._parameters.pop("weight", None) + setattr(module, "weight", None) + def load_lora_config(lora_path: str) -> dict: """Load LoRA configuration from adapter_config.json.""" @@ -61,9 +206,22 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Check if model is configured for GPTQ or AWQ weight_attn_dtype = getattr(config, "linear_attn_weight_dtype", "bf16") or "bf16" weight_mlp_dtype = getattr(config, "linear_mlp_weight_dtype", "bf16") or "bf16" + quantize_cfg = _read_quantize_config(getattr(config, "model", "")) + checkpoint_format = (quantize_cfg.get("checkpoint_format") or "").strip().lower() + ckpt_bits = int(quantize_cfg.get("bits", 0) or 0) + ckpt_group_size = int(quantize_cfg.get("group_size", 0) or 0) - use_gptq = weight_attn_dtype.lower() == "gptq" or weight_mlp_dtype.lower() == "gptq" - use_awq = weight_attn_dtype.lower() == "awq" or weight_mlp_dtype.lower() == "awq" + # NOTE: marlin variants reuse the same offline GPTQ/AWQ checkpoint keys + # (qweight/qzeros/scales[/g_idx]) and are repacked lazily in `LinearBase` + # on first forward. 
+ gptq_dtypes = {"gptq", "gptq_marlin"} + awq_dtypes = {"awq", "awq_marlin"} + use_gptq = (weight_attn_dtype or "").lower() in gptq_dtypes or (weight_mlp_dtype or "").lower() in gptq_dtypes + use_awq = (weight_attn_dtype or "").lower() in awq_dtypes or (weight_mlp_dtype or "").lower() in awq_dtypes + want_gptq_marlin = (weight_attn_dtype or "").lower() == "gptq_marlin" or (weight_mlp_dtype or "").lower() == "gptq_marlin" + want_awq_marlin = (weight_attn_dtype or "").lower() == "awq_marlin" or (weight_mlp_dtype or "").lower() == "awq_marlin" + is_gptq_marlin_ckpt = checkpoint_format == "gptq_marlin" + is_awq_marlin_ckpt = checkpoint_format == "awq_marlin" if not (use_gptq or use_awq): return loaded_gptq, loaded_awq, skipped @@ -145,13 +303,14 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Determine format: check if g_idx exists (GPTQ) or not (AWQ) has_g_idx = "g_idx" in key_dict - if has_g_idx and use_gptq: + is_gptq_keyset = has_g_idx or is_gptq_marlin_ckpt + if is_gptq_keyset and use_gptq: format = "gptq" - elif not has_g_idx and use_awq: + elif (not is_gptq_keyset) and use_awq: format = "awq" else: # Prefer GPTQ if both are enabled and g_idx exists - format = "gptq" if (use_gptq and has_g_idx) else ("awq" if use_awq else None) + format = "gptq" if (use_gptq and is_gptq_keyset) else ("awq" if use_awq else None) if format is None: skipped += 1 @@ -183,47 +342,267 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): skipped += 1 continue - # Infer dimensions from tensor shapes - out_features, packed_in = qweight.shape - in_features = packed_in * 2 # Packed int4: 2 values per byte (max estimate) - # Refine in_features from scales shape if available - if scales.shape[1:] != (): - # scales is [num_groups, in_features] or [num_groups] - if len(scales.shape) == 2: - in_features = scales.shape[1] - - # Default group_size for GPTQ/AWQ is 128 + # Infer dimensions from tensor shapes (vLLM standard format) WITHOUT + # assuming bits=4. This enables GPTQ W2/W4/W8 checkpoints. + if format == "gptq": + if is_gptq_marlin_ckpt: + # gptq_marlin export uses Marlin repacked qweight/scales layouts. + # Empirically (vLLM marlin): qweight is packed on K in tiles of 16, + # so qweight.shape[0] == in_features / 16; and scales carries original N. + out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) + in_features = int(qweight.shape[0]) * 16 + if ckpt_bits not in (4, 8): + print( + f"Warning: gptq_marlin requires bits=4/8, got bits={ckpt_bits} for {module_name}. Skipping." + ) + skipped += 1 + continue + # Keep pack_factor for dummy qzeros creation later. + pack_factor = 32 // int(ckpt_bits) + else: + # Standard GPTQ: qweight [K/pack, N] + out_features = int(qweight.shape[1]) + # qzeros: [K/group, N/pack] (may be empty for some checkpoints) + if getattr(qzeros, "numel", lambda: 1)() == 0: + if ckpt_bits not in (2, 4, 8): + print( + f"Warning: qzeros is empty and cannot infer bits for {module_name}. " + f"Please ensure quantize_config.json contains bits (2/4/8). Skipping." + ) + skipped += 1 + continue + pack_factor = 32 // int(ckpt_bits) + else: + if int(qzeros.shape[1]) <= 0 or out_features % int(qzeros.shape[1]) != 0: + print( + f"Warning: Cannot infer GPTQ pack_factor from qzeros for {module_name}: " + f"qzeros.shape={tuple(qzeros.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." 
+ ) + skipped += 1 + continue + pack_factor = out_features // int(qzeros.shape[1]) # 32 / bits + in_features = int(qweight.shape[0]) * pack_factor + else: + # awq: qweight: [K, N/pack], scales: [K/group, N] + out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) + if int(qweight.shape[1]) <= 0 or out_features % int(qweight.shape[1]) != 0: + print( + f"Warning: Cannot infer AWQ pack_factor from scales/qweight for {module_name}: " + f"scales.shape={tuple(scales.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." + ) + skipped += 1 + continue + pack_factor = out_features // int(qweight.shape[1]) # 32 / bits (expected 8 for AWQ 4-bit) + in_features = int(qweight.shape[0]) + + # Infer group_size from qzeros/scales. + # qzeros/scales are groupwise on K (in_features). group_size = 128 - # Infer group_size from scales/qzeros shape - num_groups = qzeros.shape[0] - if num_groups > 0: - estimated_group_size = (out_features + num_groups - 1) // num_groups - if estimated_group_size > 0: - group_size = estimated_group_size + if ckpt_group_size not in (0, None): + # quantize_config.json stores actual group_size (may be -1) + group_size = int(ckpt_group_size) + else: + if is_gptq_marlin_ckpt and len(scales.shape) == 2 and int(scales.shape[0]) > 0: + # marlin scales often use first dim = 2 * num_groups + num_groups = int(scales.shape[0]) // 2 + if num_groups > 0 and in_features % num_groups == 0: + group_size = in_features // num_groups + else: + num_groups = int(qzeros.shape[0]) if getattr(qzeros, "numel", lambda: 1)() > 0 else 0 + if num_groups > 0 and in_features % num_groups == 0: + group_size = in_features // num_groups + elif len(scales.shape) == 2 and int(scales.shape[0]) > 0 and in_features % int(scales.shape[0]) == 0: + group_size = in_features // int(scales.shape[0]) + + # For gptq_marlin checkpoints qzeros may be empty; create a shape-compatible dummy + # packed qzeros so LinearBase considers offline weights present. + if ( + format == "gptq" + and getattr(qzeros, "numel", lambda: 1)() == 0 + and (want_gptq_marlin or is_gptq_marlin_ckpt) + and ckpt_bits in (2, 4, 8) + ): + group_size_norm = in_features if group_size == -1 else group_size + if group_size_norm <= 0 or (in_features % group_size_norm) != 0: + print( + f"Warning: Invalid group_size={group_size} for {module_name} with in_features={in_features}. " + "Skipping." + ) + skipped += 1 + continue + num_groups = in_features // group_size_norm + try: + qzeros = _make_packed_qzeros_constant( + num_groups=num_groups, + out_features=out_features, + bits=int(ckpt_bits), + device=qweight.device, + ) + except Exception as e: + print(f"Warning: Failed to create dummy qzeros for {module_name}: {e}. Skipping.") + skipped += 1 + continue - # Handle tensor parallel: if tp_size > 1, we need to handle sharding - # For MVP, only support TP=1 (tensor_parallel_size=1) - tp_size = getattr(module, "tp_size", 1) + # Handle tensor parallel sharding (TP>1). + # ColumnParallelLinear: tp_dim=0 (shard N/out_features) + # RowParallelLinear : tp_dim=1 (shard K/in_features) + tp_size = int(getattr(module, "tp_size", 1) or 1) + tp_rank = int(getattr(module, "tp_rank", 0) or 0) + tp_dim = getattr(module, "tp_dim", None) if tp_size > 1: - print( - f"Warning: Tensor parallel (TP={tp_size}) is not fully supported for offline quantized weights. " - f"Skipping {module_name}. Please provide a TP=1 checkpoint or implement TP sharding logic." 
- ) - skipped += 1 - continue + if tp_dim not in (0, 1): + print( + f"Warning: Unsupported tp_dim={tp_dim} for offline quantized weights. " + f"Skipping {module_name}." + ) + skipped += 1 + continue + + # Shard along output features (N) for column-parallel modules. + if tp_dim == 0: + if out_features % tp_size != 0: + print( + f"Warning: out_features={out_features} not divisible by TP={tp_size} for {module_name}. " + "Skipping offline quant weights for this module." + ) + skipped += 1 + continue + out_per = out_features // tp_size + out_start = tp_rank * out_per + out_end = out_start + out_per + if out_per % pack_factor != 0: + print( + f"Warning: out_features_per_partition={out_per} not divisible by pack_factor={pack_factor} " + f"for {module_name}. Skipping." + ) + skipped += 1 + continue + out_packed_per = out_per // pack_factor + out_packed_start = out_start // pack_factor + out_packed_end = out_packed_start + out_packed_per + + if format == "gptq": + if is_gptq_marlin_ckpt: + # Marlin qweight packs N by a factor (bits/2): N_packed = N * (bits/2) + n_factor = int(ckpt_bits) // 2 + if n_factor <= 0: + print(f"Warning: invalid gptq_marlin n_factor for bits={ckpt_bits} ({module_name}). Skipping.") + skipped += 1 + continue + qweight = qweight[:, (out_start * n_factor):(out_end * n_factor)] + # scales keep original N + scales = scales[:, out_start:out_end] + # qzeros stays dummy/empty; g_idx stays on K. + out_features = out_per + else: + # qweight: [K/pack, N] + qweight = qweight[:, out_start:out_end] + # qzeros: [K/group, N/pack] + qzeros = qzeros[:, out_packed_start:out_packed_end] + # scales: [K/group, N] + scales = scales[:, out_start:out_end] + out_features = out_per + else: + # awq qweight: [K, N/pack] + qweight = qweight[:, out_packed_start:out_packed_end] + qzeros = qzeros[:, out_packed_start:out_packed_end] + scales = scales[:, out_start:out_end] + out_features = out_per + + # Shard along input features (K) for row-parallel modules. + elif tp_dim == 1: + if in_features % tp_size != 0: + print( + f"Warning: in_features={in_features} not divisible by TP={tp_size} for {module_name}. " + "Skipping offline quant weights for this module." + ) + skipped += 1 + continue + in_per = in_features // tp_size + in_start = tp_rank * in_per + in_end = in_start + in_per + if group_size <= 0 or (in_per % group_size) != 0 or (in_start % group_size) != 0: + print( + f"Warning: group_size={group_size} incompatible with TP sharding for {module_name} " + f"(in_per={in_per}, in_start={in_start}). Skipping." + ) + skipped += 1 + continue + g_start = in_start // group_size + g_end = in_end // group_size + + if format == "gptq": + if is_gptq_marlin_ckpt: + # Marlin qweight packs K in tiles of 16: K_packed = K / 16 + if in_start % 16 != 0: + print( + f"Warning: gptq_marlin requires in_start divisible by 16, got in_start={in_start} " + f"for {module_name}. Skipping." + ) + skipped += 1 + continue + q_start = in_start // 16 + q_end = in_end // 16 + qweight = qweight[q_start:q_end, :] + # scales first dim is typically 2*num_groups + scales = scales[(2 * g_start):(2 * g_end), :] + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() > 0: + g_idx = g_idx[in_start:in_end] + in_features = in_per + else: + # qweight: [K/pack, N] (packed on K) + if in_start % pack_factor != 0: + print( + f"Warning: in_start={in_start} not divisible by pack_factor={pack_factor} " + f"for {module_name}. Skipping." 
+ ) + skipped += 1 + continue + q_start = in_start // pack_factor + q_end = in_end // pack_factor + qweight = qweight[q_start:q_end, :] + qzeros = qzeros[g_start:g_end, :] + scales = scales[g_start:g_end, :] + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() > 0: + g_idx = g_idx[in_start:in_end] + in_features = in_per + else: + # awq qweight: [K, N/pack] + qweight = qweight[in_start:in_end, :] + qzeros = qzeros[g_start:g_end, :] + scales = scales[g_start:g_end, :] + in_features = in_per + # Treat empty g_idx as "not provided" for GPTQ (desc_act=False checkpoints often store empty). + if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() == 0: + g_idx = None + # Set offline quantized weight try: - module.set_offline_quantized_weight( - format=format, - qweight=qweight, - qzeros=qzeros, - scales=scales, - out_features=out_features, - in_features=in_features, - group_size=group_size, - g_idx=g_idx, - ) + if format == "gptq" and is_gptq_marlin_ckpt: + if ckpt_bits not in (4, 8): + raise ValueError(f"gptq_marlin checkpoint requires bits=4/8, got bits={ckpt_bits}") + _set_offline_gptq_marlin_weight( + module, + qweight=qweight, + scales=scales, + out_features=out_features, + in_features=in_features, + group_size=group_size, + bits=int(ckpt_bits), + g_idx=g_idx, + ) + else: + module.set_offline_quantized_weight( + format=format, + qweight=qweight, + qzeros=qzeros, + scales=scales, + out_features=out_features, + in_features=in_features, + group_size=group_size, + g_idx=g_idx, + ) if format == "gptq": loaded_gptq += 1 else: diff --git a/diffulex/utils/quantization/quantize_model.py b/diffulex/utils/quantization/quantize_model.py index b82710f..bd77977 100644 --- a/diffulex/utils/quantization/quantize_model.py +++ b/diffulex/utils/quantization/quantize_model.py @@ -1,15 +1,16 @@ #!/usr/bin/env python3 -"""离线量化脚本:将模型权重量化为 GPTQ/AWQ 格式 +"""离线量化脚本:将模型权重量化为 vLLM 标准 GPTQ/AWQ 格式 -支持两种量化格式: -- GPTQ: Groupwise quantization with optional g_idx -- AWQ: Groupwise quantization (no g_idx) +支持两种量化格式(对齐 vLLM 权重格式): +- GPTQ: qweight/qzeros 为 int32 packed,scales 为 fp16,g_idx 可选(常见 desc_act=False 时为空) +- GPTQ_MARLIN: 导出 Marlin-ready 的 GPTQ 权重布局(qweight 已 repack,scales 已 permute,zp 为空) +- AWQ : qweight/qzeros 为 int32 packed,scales 为 fp16 使用方法: python -m diffulex.utils.quantization.quantize_model \ --model-path /path/to/model \ --output-path /path/to/output \ - --quant-format gptq \ + --quant-format gptq_marlin \ --group-size 128 \ --bits 4 """ @@ -41,193 +42,179 @@ from glob import glob -def _pack_int4_to_int8(int4_tensor: torch.Tensor) -> torch.Tensor: - """Pack int4 tensor into int8 format. 
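# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper `pack_rows_int32`; the numbers are
# examples only): the removed `_pack_int4_to_int8` helper packed two int4
# nibbles per int8 byte along the in_features columns of a [N, K] weight,
# whereas the vLLM layout targeted here packs 32 // bits consecutive K rows
# into one int32 per output column, which is what `gptq_pack` is expected to
# produce.
import torch

def pack_rows_int32(q: torch.Tensor, bits: int) -> torch.Tensor:
    """q: [K, N] integer codes in [0, 2**bits); returns int32 [K // (32 // bits), N]."""
    pack = 32 // bits
    k, n = q.shape
    assert 32 % bits == 0 and k % pack == 0
    q = q.to(torch.int32).reshape(k // pack, pack, n)
    out = torch.zeros(k // pack, n, dtype=torch.int32)
    for j in range(pack):
        out |= q[:, j, :] << (bits * j)  # first row lands in the least-significant bits
    return out

codes = torch.randint(0, 16, (16, 4))           # 4-bit codes, K=16, N=4
assert pack_rows_int32(codes, bits=4).shape == (2, 4)
# ---------------------------------------------------------------------------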
- - Args: - int4_tensor: int8 tensor [N, K] with values in [-8, 7] - - Returns: - packed: int8 tensor [N, (K + 1) // 2] with 2 int4 values per byte - """ - out_features, in_features = int4_tensor.shape - - # Clamp to int4 range [-8, 7] - int4_tensor = int4_tensor.clamp(-8, 7) - - # Convert to unsigned: [-8, 7] -> [0, 15] - uint8_tensor = (int4_tensor + 8).to(torch.uint8) - - # Pad to even number of columns if needed - if in_features % 2 != 0: - pad_size = 1 - padding = torch.zeros(out_features, pad_size, dtype=torch.uint8, device=uint8_tensor.device) + 8 - uint8_tensor = torch.cat([uint8_tensor, padding], dim=1) - padded_in_features = in_features + pad_size - else: - padded_in_features = in_features - - # Reshape to [N, K//2, 2] where first column is even indices, second is odd indices - reshaped = uint8_tensor.view(out_features, padded_in_features // 2, 2) - - # Pack: lower 4 bits = even columns, upper 4 bits = odd columns - packed = reshaped[:, :, 0] | (reshaped[:, :, 1] << 4) - return packed.to(torch.int8) +def _require_vllm(): + try: + from vllm.scalar_type import scalar_types # type: ignore + from vllm.model_executor.layers.quantization.utils.quant_utils import ( # type: ignore + awq_pack, + gptq_pack, + pack_cols, + quantize_weights, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "离线 GPTQ/AWQ 打包已切换到 vLLM 标准格式,需要可 import 的 vLLM。" + ) from e + return scalar_types, quantize_weights, gptq_pack, awq_pack, pack_cols -def _quantize_gptq_groupwise( - weight: torch.Tensor, - group_size: int = 128, - bits: int = 4, - g_idx: Optional[torch.Tensor] = None, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Quantize weight using GPTQ groupwise quantization. - - Args: - weight: float32 tensor [out_features, in_features] - group_size: Group size for quantization (default: 128) - bits: Number of bits per weight (default: 4) - g_idx: Optional int32 tensor [out_features] mapping each output channel to its group. - If None, uses sequential grouping: group_id = out_idx // group_size - - Returns: - qweight: int8 packed int4 weights [out_features, (in_features + 1) // 2] - qzeros: int8 packed int4 zeros [num_groups, (in_features + 1) // 2] - scales: float32 per-group scales [num_groups, in_features] - g_idx: int32 tensor [out_features] group indices (always returned, even if input was None) +def _require_vllm_marlin(): + # Marlin 预处理依赖 CUDA custom ops + try: + from vllm import _custom_ops as ops # type: ignore + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_permute_scales, + ) + except Exception as e: # pragma: no cover + raise RuntimeError( + "导出 gptq_marlin 格式需要可 import 的 vLLM Marlin(含 CUDA custom ops)。" + ) from e + return ops, marlin_permute_scales + + +def _quantize_to_vllm_gptq( + weight: torch.Tensor, *, group_size: int, bits: int, use_v2_format: bool = False +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Quantize and pack weights into vLLM GPTQ checkpoint format. 
+ + Input: + weight: fp32 [N, K] (PyTorch Linear weight) + Output (vLLM format): + qweight: int32 [K/pack, N] + qzeros : int32 [K/group, N/pack] (GPTQ v1 stores (zeros - 1); v2 stores zeros) + scales : fp16 [K/group, N] + g_idx : int32 empty tensor (desc_act=False) """ - out_features, in_features = weight.shape - device = weight.device - - # Determine group assignments - if g_idx is None: - # Sequential grouping: group_id = out_idx // group_size - group_ids = torch.arange(out_features, device=device) // group_size - else: - # Use provided g_idx - if g_idx.shape != (out_features,): - raise ValueError(f"g_idx shape mismatch: got {g_idx.shape}, expected ({out_features},)") - group_ids = g_idx.to(device=device).to(torch.int64) - - num_groups = int(group_ids.max().item() + 1) - - # Quantize per group - qweight_list = [] - qzeros_list = [] - scales_list = [] - - for g in range(num_groups): - # Get output channels in this group - group_mask = (group_ids == g) - group_indices = torch.where(group_mask)[0] - - if len(group_indices) == 0: - continue - - group_weight = weight[group_indices] # [group_out_size, in_features] - group_out_size = group_weight.shape[0] - - # Compute scale and zero point per input feature (per-channel within group) - # For GPTQ, we use per-channel quantization within each group - abs_max = torch.abs(group_weight).max(dim=0, keepdim=True)[0] # [1, in_features] - scales_group = (abs_max.clamp(min=1e-8) / (2 ** (bits - 1) - 1)).squeeze(0) # [in_features] - - # Compute zero point: mean of group (per-channel) - zeros_group = group_weight.mean(dim=0) # [in_features] - - # Quantize: (weight - zero) / scale - quantized_group = ((group_weight - zeros_group.unsqueeze(0)) / scales_group.unsqueeze(0).clamp(min=1e-8)) - quantized_group = quantized_group.round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - - # Pack quantized weights - packed_group = _pack_int4_to_int8(quantized_group) # [group_out_size, (in_features + 1) // 2] - qweight_list.append(packed_group) - - # Quantize and pack zeros - zeros_quantized = (zeros_group / scales_group.clamp(min=1e-8)).round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - zeros_packed = _pack_int4_to_int8(zeros_quantized.unsqueeze(0)) # [1, (in_features + 1) // 2] - qzeros_list.append(zeros_packed) - - # Store scales - scales_list.append(scales_group.unsqueeze(0)) # [1, in_features] - - # Concatenate all groups - qweight = torch.cat(qweight_list, dim=0) # [out_features, (in_features + 1) // 2] - qzeros = torch.cat(qzeros_list, dim=0) # [num_groups, (in_features + 1) // 2] - scales = torch.cat(scales_list, dim=0) # [num_groups, in_features] - - # Ensure g_idx is returned (create if was None) - if g_idx is None: - g_idx = group_ids.to(torch.int32) - else: - g_idx = g_idx.to(torch.int32) - + scalar_types, quantize_weights, gptq_pack, _, pack_cols = _require_vllm() + # vLLM GPTQConfig mentions 2/3/4/8, but the standard vLLM int32 packing + # used by `gptq_pack/pack_cols` requires 32 % bits == 0. + # So we support 2/4/8 here; 3-bit would need a different packing scheme. 
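    # ------------------------------------------------------------------
    # Worked shape example (illustrative numbers only): with K = in_features,
    # N = out_features and pack = 32 // bits, the tensors documented above
    # come out as
    #   qweight [K / pack, N], qzeros [K / group, N / pack], scales [K / group, N].
    # E.g. bits=4 (pack=8), K=4096, N=11008, group_size=128:
    #   qweight (512, 11008), qzeros (32, 1376), scales (32, 11008).
    # ------------------------------------------------------------------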
+ if bits not in (2, 4, 8): + raise ValueError( + f"GPTQ bits 仅支持 2/4/8(vLLM 标准 int32 pack 要求 32%bits==0),当前 bits={bits}" + ) + + # vLLM operates on (K, N) + w = weight.T.contiguous() + size_k, size_n = w.shape + group_size_norm = size_k if group_size == -1 else group_size + if group_size_norm <= 0 or size_k % group_size_norm != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={size_k}") + + if bits == 2: + quant_type = scalar_types.uint2b2 + elif bits == 4: + quant_type = scalar_types.uint4b8 + else: # bits == 8 + quant_type = scalar_types.uint8b128 + + _, w_q, w_s, _ = quantize_weights(w, quant_type, group_size_norm, zero_points=False) + + pack_factor = 32 // bits + qweight = gptq_pack(w_q, bits, size_k, size_n).contiguous() # [K/pack, N] + + num_groups = size_k // group_size_norm + zeros = torch.full( + (num_groups, size_n), + int(getattr(quant_type, "bias", 0)), + dtype=torch.int32, + device=w.device, + ) + # GPTQ v1 stores zeros-1 in the checkpoint. + zeros_to_store = zeros if use_v2_format else (zeros - 1) + qzeros = pack_cols(zeros_to_store, bits, num_groups, size_n).contiguous() # [K/group, N/pack] + + scales = w_s.to(torch.float16).contiguous() # [K/group, N] + g_idx = torch.empty((0,), dtype=torch.int32, device=w.device) return qweight, qzeros, scales, g_idx -def _quantize_awq_groupwise( - weight: torch.Tensor, - group_size: int = 128, - bits: int = 4, +def _quantize_to_vllm_gptq_marlin( + weight: torch.Tensor, *, group_size: int, bits: int +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Quantize weights and export marlin-ready GPTQ layout. + + 该导出格式对齐 vLLM `MarlinLinearKernel.process_weights_after_loading` 的结果: + - qweight: 已执行 `gptq_marlin_repack` + - scales : 已执行 `marlin_permute_scales` + - qzeros : 置空(Marlin GPTQ symmetric 路径不使用 runtime zp) + - g_idx : 空(desc_act=False) + + 注意:需要在 CUDA 上执行(`gptq_marlin_repack` 为 CUDA op)。 + """ + if weight.device.type != "cuda": + raise ValueError("gptq_marlin 导出需要 device=cuda(Marlin repack 为 CUDA op)") + + ops, marlin_permute_scales = _require_vllm_marlin() + + # 先按 vLLM 标准 GPTQ(symmetric, zero_points=False)量化并打包 + qweight, _qzeros, scales, g_idx = _quantize_to_vllm_gptq( + weight, group_size=group_size, bits=bits, use_v2_format=False + ) + + # vLLM GPTQ packing 的 shape 基于 w=(K,N);这里 size_k=in_features, size_n=out_features + size_k = weight.shape[1] + size_n = weight.shape[0] + group_size_norm = size_k if group_size == -1 else group_size + + # desc_act=False 时 perm 为空 + empty_perm = torch.empty((0,), dtype=torch.int32, device=weight.device) + + marlin_qweight = ops.gptq_marlin_repack( + qweight.contiguous(), + perm=empty_perm, + size_k=size_k, + size_n=size_n, + num_bits=bits, + is_a_8bit=False, + ).contiguous() + + marlin_scales = marlin_permute_scales( + scales.contiguous(), + size_k=size_k, + size_n=size_n, + group_size=group_size_norm, + is_a_8bit=False, + ).contiguous() + + # Marlin GPTQ symmetric 不使用 runtime zero points,导出空 qzeros 保持一致性 + marlin_qzeros = torch.empty((0,), dtype=torch.int32, device=weight.device) + marlin_g_idx = g_idx # already empty + + return marlin_qweight, marlin_qzeros, marlin_scales, marlin_g_idx + + +def _quantize_to_vllm_awq( + weight: torch.Tensor, *, group_size: int, bits: int ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Quantize weight using AWQ groupwise quantization. 
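# ---------------------------------------------------------------------------
# Shape sketch for the marlin-ready GPTQ layout (hypothetical helper; the
# tiling facts mirror the assumptions made by the loader changes in this
# patch: Marlin packs K in tiles of 16 and widens the packed N dimension by
# bits / 2).
def marlin_qweight_shape(in_features: int, out_features: int, bits: int) -> tuple[int, int]:
    assert bits in (4, 8) and in_features % 16 == 0
    return (in_features // 16, out_features * (bits // 2))

# Plain GPTQ 4-bit qweight for K=N=4096 is [512, 4096]; after repack it becomes:
assert marlin_qweight_shape(4096, 4096, 4) == (256, 8192)
assert marlin_qweight_shape(4096, 4096, 8) == (256, 16384)
# ---------------------------------------------------------------------------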
- - Args: - weight: float32 tensor [out_features, in_features] - group_size: Group size for quantization (default: 128) - bits: Number of bits per weight (default: 4) - - Returns: - qweight: int8 packed int4 weights [out_features, (in_features + 1) // 2] - qzeros: int8 packed int4 zeros [num_groups, (in_features + 1) // 2] - scales: float32 per-group scales [num_groups, in_features] or [num_groups] + """Quantize and pack weights into vLLM AWQ checkpoint format. + + Input: + weight: fp32 [N, K] + Output (vLLM format): + qweight: int32 [K, N/pack] + qzeros : int32 [K/group, N/pack] + scales : fp16 [K/group, N] """ - out_features, in_features = weight.shape - device = weight.device - - num_groups = (out_features + group_size - 1) // group_size - - # Quantize per group (sequential grouping) - qweight_list = [] - qzeros_list = [] - scales_list = [] - - for g in range(num_groups): - start_idx = g * group_size - end_idx = min((g + 1) * group_size, out_features) - group_weight = weight[start_idx:end_idx] # [group_size (or remainder), in_features] - group_out_size = group_weight.shape[0] - - # AWQ: Compute scale per group (can be scalar or per-channel) - # For simplicity, use per-channel scales within group - abs_max = torch.abs(group_weight).max(dim=0, keepdim=True)[0] # [1, in_features] - scales_group = (abs_max.clamp(min=1e-8) / (2 ** (bits - 1) - 1)).squeeze(0) # [in_features] - - # AWQ: Compute zero point per input channel (per-channel) - # Use minimum value for better quantization range - zeros_group = group_weight.min(dim=0)[0] # [in_features] - - # Quantize: (weight - zero) / scale - quantized_group = ((group_weight - zeros_group.unsqueeze(0)) / scales_group.unsqueeze(0).clamp(min=1e-8)) - quantized_group = quantized_group.round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - - # Pack quantized weights - packed_group = _pack_int4_to_int8(quantized_group) # [group_out_size, (in_features + 1) // 2] - qweight_list.append(packed_group) - - # Quantize and pack zeros - zeros_quantized = (zeros_group / scales_group.clamp(min=1e-8)).round().clamp(-2 ** (bits - 1), 2 ** (bits - 1) - 1).to(torch.int8) - zeros_packed = _pack_int4_to_int8(zeros_quantized.unsqueeze(0)) # [1, (in_features + 1) // 2] - qzeros_list.append(zeros_packed) - - # Store scales - scales_list.append(scales_group.unsqueeze(0)) # [1, in_features] - - # Concatenate all groups - qweight = torch.cat(qweight_list, dim=0) # [out_features, (in_features + 1) // 2] - qzeros = torch.cat(qzeros_list, dim=0) # [num_groups, (in_features + 1) // 2] - scales = torch.cat(scales_list, dim=0) # [num_groups, in_features] - + scalar_types, quantize_weights, _, awq_pack, _ = _require_vllm() + if bits != 4: + raise ValueError(f"AWQ 目前仅支持 4-bit,当前 bits={bits}") + + w = weight.T.contiguous() + size_k, size_n = w.shape + group_size_norm = size_k if group_size == -1 else group_size + if group_size_norm <= 0 or size_k % group_size_norm != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={size_k}") + + quant_type = scalar_types.uint4 + _, w_q, w_s, w_zp = quantize_weights(w, quant_type, group_size_norm, zero_points=True) + if w_zp is None: + raise RuntimeError("AWQ zero_points=True 但未生成 zero points,vLLM 量化返回异常。") + + qweight = awq_pack(w_q, bits, size_k, size_n).contiguous() # [K, N/pack] + num_groups = size_k // group_size_norm + qzeros = awq_pack(w_zp.to(torch.int32), bits, num_groups, size_n).contiguous() # [K/group, N/pack] + scales = w_s.to(torch.float16).contiguous() # [K/group, N] return qweight, qzeros, scales @@ 
-252,8 +239,10 @@ def quantize_model( If None, quantizes all linear layers. device: Device to use for quantization ("cpu" or "cuda") """ - if quant_format not in ["gptq", "awq"]: - raise ValueError(f"Unsupported quant_format: {quant_format}. Must be 'gptq' or 'awq'") + if quant_format not in ["gptq", "gptq_marlin", "awq"]: + raise ValueError( + f"Unsupported quant_format: {quant_format}. Must be 'gptq', 'gptq_marlin' or 'awq'" + ) output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) @@ -327,29 +316,27 @@ def quantize_model( weight_fp32 = weight.to(torch.float32).to(device) # Quantize + prefix = key[:-7] # Remove ".weight" if quant_format == "gptq": - qweight, qzeros, scales, g_idx = _quantize_gptq_groupwise( - weight_fp32, group_size=group_size, bits=bits, g_idx=None + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq( + weight_fp32, group_size=group_size, bits=bits, use_v2_format=False + ) + elif quant_format == "gptq_marlin": + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq_marlin( + weight_fp32, group_size=group_size, bits=bits ) - # Save quantized weights with module prefix - prefix = key[:-7] # Remove ".weight" quantized_weights[f"{prefix}.qweight"] = qweight.cpu() quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() quantized_weights[f"{prefix}.scales"] = scales.cpu() + # Keep g_idx key for compatibility (often empty when desc_act=False). quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() - quantized_weights[f"{prefix}.group_size"] = torch.tensor(group_size, dtype=torch.int32) - quantized_weights[f"{prefix}.bits"] = torch.tensor(bits, dtype=torch.int32) else: # awq - qweight, qzeros, scales = _quantize_awq_groupwise( + qweight, qzeros, scales = _quantize_to_vllm_awq( weight_fp32, group_size=group_size, bits=bits ) - # Save quantized weights with module prefix - prefix = key[:-7] # Remove ".weight" quantized_weights[f"{prefix}.qweight"] = qweight.cpu() quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() quantized_weights[f"{prefix}.scales"] = scales.cpu() - quantized_weights[f"{prefix}.group_size"] = torch.tensor(group_size, dtype=torch.int32) - quantized_weights[f"{prefix}.bits"] = torch.tensor(bits, dtype=torch.int32) metadata["quantized_modules"].append({ "name": prefix, @@ -391,6 +378,20 @@ def quantize_model( metadata_file = output_path / f"quantization_metadata_{quant_format}.json" with open(metadata_file, "w") as f: json.dump(metadata, f, indent=2) + + # vLLM GPTQ/GPTQ-Marlin 会读取 quantize_config.json + # - gptq_marlin: 需要 sym/desc_act 等字段用于识别并选择 Marlin kernel + if quant_format == "gptq_marlin": + quantize_cfg = { + "bits": int(bits), + "group_size": int(group_size), + "desc_act": False, + "sym": True, + "lm_head": False, + "checkpoint_format": "gptq_marlin", + } + with open(output_path / "quantize_config.json", "w") as f: + json.dump(quantize_cfg, f, indent=2) print(f"\n✓ Quantization complete!") print(f" - Quantized {len(metadata['quantized_modules'])} modules") @@ -408,7 +409,13 @@ def main(): ) parser.add_argument("--model-path", type=str, required=True, help="输入模型路径") parser.add_argument("--output-path", type=str, required=True, help="输出路径") - parser.add_argument("--quant-format", type=str, choices=["gptq", "awq"], default="gptq", help="量化格式: gptq 或 awq") + parser.add_argument( + "--quant-format", + type=str, + choices=["gptq", "gptq_marlin", "awq"], + default="gptq", + help="量化格式: gptq / gptq_marlin / awq", + ) parser.add_argument("--group-size", type=int, default=128, help="量化组大小 (默认: 128)") parser.add_argument("--bits", type=int, 
default=4, help="每个权重的位数 (默认: 4)") parser.add_argument("--target-modules", type=str, help="要量化的模块名称模式(逗号分隔),例如: q_proj,k_proj,v_proj") diff --git a/diffulex/utils/quantization/registry.py b/diffulex/utils/quantization/registry.py index eec11ea..0b7be63 100644 --- a/diffulex/utils/quantization/registry.py +++ b/diffulex/utils/quantization/registry.py @@ -84,10 +84,14 @@ def _normalize_linear_dtype(dtype: str) -> str: "e5m2": "fp8_e5m2", # Weight-only methods (placeholders) "gptq": "gptq", + "gptq_marlin": "gptq_marlin", + "gptq_marlin_24": "gptq_marlin_24", "awq": "awq", + "awq_marlin": "awq_marlin", "gptq_awq": "gptq_awq", - # vLLM-style fused W8A16 path (Diffulex vendored): user-facing alias "marlin" - # Normalized key is "marlin_int8" to avoid conflating with other quant methods. + # vLLM-style fused W8A16 path (AllSpark): keep user-facing alias "marlin" + # for backward compatibility. Normalized key is "marlin_int8" to avoid + # conflating with other quant methods. "marlin": "marlin_int8", "marlin_int8": "marlin_int8", } @@ -150,6 +154,19 @@ def create_linear_strategy(*, weight_dtype: str, act_dtype: str) -> LinearQuanti def registered_linear_dtypes() -> list[str]: """Return the normalized dtype/method names accepted by `_normalize_linear_dtype`.""" # Keep this list stable for CLI/help messages. - return ["bf16", "int8", "int4", "fp8_e4m3", "fp8_e5m2", "gptq", "awq", "gptq_awq", "marlin_int8"] + return [ + "bf16", + "int8", + "int4", + "fp8_e4m3", + "fp8_e5m2", + "gptq", + "gptq_marlin", + "gptq_marlin_24", + "awq", + "awq_marlin", + "gptq_awq", + "marlin_int8", + ] diff --git a/diffulex/utils/quantization/strategies/__init__.py b/diffulex/utils/quantization/strategies/__init__.py index d7cd5c1..1fcc216 100644 --- a/diffulex/utils/quantization/strategies/__init__.py +++ b/diffulex/utils/quantization/strategies/__init__.py @@ -8,14 +8,15 @@ from diffulex.utils.quantization.strategies.linear_bf16 import LinearBF16Strategy from diffulex.utils.quantization.strategies.linear_stub import LinearStubStrategy from diffulex.utils.quantization.strategies.linear_int8_w8a16 import LinearInt8W8A16Strategy # noqa: F401 -from diffulex.utils.quantization.strategies.linear_marlin_int8_w8a16 import LinearMarlinInt8W8A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_int4_w4a16 import LinearInt4W4A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_int8_w8a8 import LinearInt8W8A8Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_int4_w4a8 import LinearInt4W4A8Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_fp8_w8a16 import LinearFP8W8A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_fp8_w8a8 import LinearFP8W8A8Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_gptq_w4a16 import LinearGPTQW4A16Strategy # noqa: F401 +from diffulex.utils.quantization.strategies.linear_gptq_marlin_w4a16 import LinearGPTQMarlinW4A16Strategy # noqa: F401 from diffulex.utils.quantization.strategies.linear_awq_w4a16 import LinearAWQW4A16Strategy # noqa: F401 +from diffulex.utils.quantization.strategies.linear_awq_marlin_w4a16 import LinearAWQMarlinW4A16Strategy # noqa: F401 __all__ = [ 'NoQuantizationStrategy', @@ -24,13 +25,14 @@ 'LinearBF16Strategy', 'LinearStubStrategy', 'LinearInt8W8A16Strategy', - 'LinearMarlinInt8W8A16Strategy', 'LinearInt4W4A16Strategy', 'LinearInt8W8A8Strategy', 'LinearInt4W4A8Strategy', 'LinearFP8W8A16Strategy', 'LinearFP8W8A8Strategy', 'LinearGPTQW4A16Strategy', 
+ 'LinearGPTQMarlinW4A16Strategy', 'LinearAWQW4A16Strategy', + 'LinearAWQMarlinW4A16Strategy', ] diff --git a/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py new file mode 100644 index 0000000..be9389f --- /dev/null +++ b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py @@ -0,0 +1,123 @@ +""" +AWQ Marlin (W4, A16) Linear strategy using vLLM Marlin CUDA kernels. + +- Input activations: bf16 (cast to fp16 for vLLM marlin kernel) +- Weights: offline AWQ vLLM standard format (qweight/qzeros/scales) +- One-time repack/permutation is performed by Diffulex `LinearBase` and passed in via kwargs: + - awq_marlin_qweight / awq_marlin_scales / awq_marlin_zp + - awq_marlin_workspace +""" + +from __future__ import annotations + +from typing import Any, Optional + +import torch + +from diffulex.utils.quantization.registry import register_linear_strategy +from diffulex.utils.quantization.strategy import LinearQuantizationStrategy + +try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + apply_awq_marlin_linear, + marlin_make_empty_g_idx, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore +except Exception: # pragma: no cover + apply_awq_marlin_linear = None # type: ignore + marlin_make_empty_g_idx = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + +@register_linear_strategy(weight_dtype="awq_marlin", act_dtype="bf16") +def _build_linear_awq_marlin_w4a16() -> LinearQuantizationStrategy: + return LinearAWQMarlinW4A16Strategy() + + +class LinearAWQMarlinW4A16Strategy(LinearQuantizationStrategy): + @property + def name(self) -> str: + return "linear_awq_marlin_w4a16" + + @property + def linear_weight_format(self) -> str: + return "awq_marlin" + + @property + def linear_act_format(self) -> str: + return "bf16" + + def get_storage_dtype(self) -> tuple[torch.dtype, int]: + return torch.int32, 4 + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: + # Same as AWQ: [K/group, N] + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized + raise NotImplementedError("AWQ Marlin 不提供 Python dequantize;请使用 vLLM Marlin CUDA kernel。") + + def linear_forward( + self, + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + *, + quant_kind: str, + **kwargs: Any, + ) -> torch.Tensor: + _ = quant_kind, weight + if apply_awq_marlin_linear is None or scalar_types is None: + raise RuntimeError("awq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") + + qweight = kwargs.get("awq_marlin_qweight", None) + scales = kwargs.get("awq_marlin_scales", None) + zp = kwargs.get("awq_marlin_zp", None) + workspace = kwargs.get("awq_marlin_workspace", None) + in_features = 
int(kwargs.get("in_features", 0)) + out_features = int(kwargs.get("out_features", 0)) + + if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: + raise RuntimeError("awq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") + + # vLLM marlin kernels expect FP16 activations. + x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + + # AWQ marlin does not use g_idx. + empty = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + out = apply_awq_marlin_linear( + input=x_in, + weight=qweight, + weight_scale=scales, + weight_zp=zp, + g_idx=empty, + g_idx_sort_indices=empty, + workspace=workspace, + quant_type=scalar_types.uint4, + output_size_per_partition=out_features, + input_size_per_partition=in_features, + bias=marlin_bias, + input_dtype=None, + ) + return out.to(dtype=x.dtype) if out.dtype != x.dtype else out + diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 4d314a1..488176e 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -1,11 +1,11 @@ """ -AWQ W4A16 Linear quantization strategy (AWQ weight + bf16 activation). +AWQ W4A16 Linear quantization strategy (vLLM standard format). -Implementation notes: -- Weight quantization: AWQ format with groupwise quantization -- Activation: kept as bf16 (no activation quantization) -- Storage: AWQ uses packed int4 weights (qweight), int4 zeros (qzeros), and per-group scales -- Forward path: Dequantize AWQ weights to bf16, then use F.linear +- Weight format: vLLM AWQ (packed int32 qweight/qzeros + fp16 scales) +- Activation: bf16 (no activation quantization) +- Forward: vLLM custom op `awq_gemm` (with the same heuristic as vLLM) + +No TileLang dependency. """ from __future__ import annotations @@ -18,161 +18,10 @@ from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -_TILELANG_AVAILABLE = False -try: - from diffulex_kernel.python.linear_kernels import awq_w4a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - awq_w4a16_gemm = None - try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - - -def _unpack_awq_int4( - packed: torch.Tensor, - *, - out_features: int, - in_features: int, -) -> torch.Tensor: - """Unpack AWQ packed int4 weights into int8 values. 
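# ---------------------------------------------------------------------------
# Shape sketch for the vLLM AWQ layout used in this file (hypothetical helper;
# the example numbers are illustrative): for bits=4 (pack=8) the checkpoint
# stores
#   qweight [K, N/8] int32, qzeros [K/group, N/8] int32, scales [K/group, N] fp16,
# so the pack factor can be recovered as scales.shape[1] // qweight.shape[1].
def awq_pack_factor(scales_shape: tuple[int, int], qweight_shape: tuple[int, int]) -> int:
    n, n_packed = scales_shape[1], qweight_shape[1]
    assert n_packed > 0 and n % n_packed == 0
    return n // n_packed

# K=4096, N=11008, group_size=128 -> scales (32, 11008), qweight (4096, 1376)
assert awq_pack_factor((32, 11008), (4096, 1376)) == 8
# ---------------------------------------------------------------------------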
- - AWQ packs 2 int4 values per int8 byte: - - Lower 4 bits: even columns - - Upper 4 bits: odd columns - - Args: - packed: int8 tensor [out_features, (in_features + 1) // 2] - out_features: Original output features - in_features: Original input features - - Returns: - unpacked: int8 tensor [out_features, in_features] with values in [-8, 7] - """ - if packed.dtype != torch.int8: - raise TypeError(f"packed weight must be int8, got {packed.dtype}") - - out_features_actual, packed_in = packed.shape - expected_packed_in = (in_features + 1) // 2 - if packed_in != expected_packed_in: - raise ValueError( - f"Packed input dimension mismatch: got {packed_in}, " - f"expected {expected_packed_in} for in_features={in_features}" - ) - if out_features_actual != out_features: - raise ValueError( - f"Output dimension mismatch: got {out_features_actual}, " - f"expected {out_features}" - ) - - # Interpret bytes as uint8 for bit manipulation - p_u8 = packed.view(torch.uint8) - # Extract lower and upper 4 bits - low_u8 = (p_u8 & 0x0F) # [0..15] - high_u8 = ((p_u8 >> 4) & 0x0F) # [0..15] - - # Convert unsigned nibble [0..15] to signed int4 [-8..7] - # Packing: int4 [-8, 7] + 8 -> uint8 [0, 15] - # Unpacking: uint8 [0, 15] - 8 -> int4 [-8, 7] - low_s = low_u8.to(torch.int16) - 8 - high_s = high_u8.to(torch.int16) - 8 - - # Interleave low/high along in_features - unpacked = torch.empty((out_features, packed_in * 2), device=packed.device, dtype=torch.int16) - unpacked[:, 0::2] = low_s - unpacked[:, 1::2] = high_s - unpacked = unpacked[:, :in_features].to(torch.int8) - return unpacked - - -def _dequantize_awq( - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - *, - out_features: int, - in_features: int, - group_size: int = 128, -) -> torch.Tensor: - """Dequantize AWQ weights to bf16. 
- - AWQ uses groupwise quantization: - - Weight is quantized per group (group_size consecutive output channels) - - Each group has its own scale and zero point - - AWQ does not use g_idx (sequential grouping) - - Args: - qweight: int8 tensor [out_features, (in_features + 1) // 2] packed int4 - qzeros: int8 tensor [(out_features + group_size - 1) // group_size, (in_features + 1) // 2] packed int4 - scales: float32 tensor [(out_features + group_size - 1) // group_size, in_features] or [num_groups] - out_features: Output features - in_features: Input features - group_size: Group size for quantization (default: 128) - - Returns: - dequantized: bf16 tensor [out_features, in_features] - """ - device = qweight.device - - # Unpack qweight to int8 [out_features, in_features] - w_int8 = _unpack_awq_int4(qweight, out_features=out_features, in_features=in_features) - - # Unpack qzeros to int8 [num_groups, in_features] - num_groups = (out_features + group_size - 1) // group_size - if qzeros.shape[0] != num_groups: - raise ValueError( - f"qzeros shape mismatch: got {qzeros.shape[0]} groups, " - f"expected {num_groups} for out_features={out_features}, group_size={group_size}" - ) - zeros_int8 = _unpack_awq_int4(qzeros, out_features=num_groups, in_features=in_features) - - # Ensure scales have correct shape [num_groups, in_features] - if scales.shape == (num_groups,): - # Broadcast per-group scales to all input features - scales = scales.unsqueeze(-1).expand(num_groups, in_features) # [num_groups, in_features] - elif scales.shape == (num_groups, 1): - scales = scales.expand(num_groups, in_features) # [num_groups, in_features] - elif scales.shape != (num_groups, in_features): - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}), ({num_groups},), or ({num_groups}, 1)" - ) - - # Convert to float32 for dequantization - w_fp32 = w_int8.to(torch.float32) - zeros_int8_fp32 = zeros_int8.to(torch.float32) # Quantized zeros (int8) - scales_fp32 = scales.to(torch.float32) - - # Dequantize zeros: zero = zero_quantized * scale - # zeros_int8 was quantized as: zero_quantized = round(zero / scale) - # So to recover: zero = zero_quantized * scale - zeros_fp32 = zeros_int8_fp32 * scales_fp32 # [num_groups, in_features] - - # Dequantize: (weight - zero) * scale - # AWQ uses sequential grouping: group_id = out_idx // group_size - group_ids = torch.arange(out_features, device=device) // group_size # [out_features] - group_ids = group_ids.unsqueeze(-1) # [out_features, 1] - - # Gather zeros and scales for each output channel - zeros_for_channel = torch.gather( - zeros_fp32, 0, group_ids.expand(-1, in_features) - ) # [out_features, in_features] - scales_for_channel = torch.gather( - scales_fp32, 0, group_ids.expand(-1, in_features) - ) # [out_features, in_features] - - # Dequantize: quantized * scale + zero - # Quantization formula: quantized = round((weight - zero) / scale) - # Dequantization formula: weight = quantized * scale + zero - dequantized = w_fp32 * scales_for_channel + zeros_for_channel - return dequantized.to(torch.bfloat16) + from vllm import _custom_ops as ops # type: ignore +except Exception: # pragma: no cover + ops = None # type: ignore @register_linear_strategy(weight_dtype="awq", act_dtype="bf16") @@ -181,21 +30,6 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): - """AWQ W4A16 Linear strategy: AWQ weight quantization + bf16 activation. 
- - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: AWQ format with groupwise quantization (typically group_size=128). - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Dequantized weights are cached to avoid re-dequantizing on every forward pass. - """ - - def __init__(self): - """Initialize strategy (no cache needed when using kernel).""" - super().__init__() - # TileLang autotune config cache: (device, M_bucket, N, K, num_groups, group_size) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int, int, int], dict] = {} - @property def name(self) -> str: return "linear_awq_w4a16" @@ -209,99 +43,33 @@ def linear_act_format(self) -> str: return "bf16" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # AWQ weights are stored as packed int8 (2 int4 per byte) - return torch.int8, 1 + # vLLM AWQ stores packed weights in int32. + return torch.int32, 4 def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for AWQ groupwise quantization. - - For [out_features, in_features] weight with group_size groups: - - scales shape is [(out_features + group_size - 1) // group_size, in_features] - or [(out_features + group_size - 1) // group_size] (broadcasted) - """ - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - out_features, in_features = original_shape[0], original_shape[1] - group_size = kwargs.get("group_size", 128) - num_groups = (out_features + group_size - 1) // group_size - return (num_groups, in_features) - - def quantize(self, tensor: torch.Tensor, **kwargs): - """AWQ quantization is typically done offline, so this is a placeholder.""" + # vLLM AWQ scales: [K/group, N], where Linear weight is (N, K). + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + # Offline AWQ is handled by `diffulex.utils.quantization.quantize_model`. + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized raise NotImplementedError( - "AWQ quantization should be done offline using AWQ tools. " - "This strategy only supports loading pre-quantized weights." + "AWQ dequantize is not implemented in Diffulex. " + "Use vLLM kernels via linear_forward." ) - def dequantize( - self, - quantized: torch.Tensor, - scale_or_metadata: Any, - **kwargs - ) -> torch.Tensor: - """Dequantize AWQ weights. 
- - Args: - quantized: Not used (kept for interface compatibility) - scale_or_metadata: Dict with keys: - - 'qweight': int8 packed int4 weights - - 'qzeros': int8 packed int4 zeros - - 'scales': float32 per-group scales - - 'out_features': int - - 'in_features': int - - 'group_size': int (default: 128) - **kwargs: Additional arguments - - Returns: - Dequantized tensor in bf16 - """ - if not isinstance(scale_or_metadata, dict): - raise ValueError( - "AWQ dequantize requires dict metadata with keys: " - "qweight, qzeros, scales, out_features, in_features, group_size (optional)" - ) - - qweight = scale_or_metadata["qweight"] - qzeros = scale_or_metadata["qzeros"] - scales = scale_or_metadata["scales"] - out_features = scale_or_metadata["out_features"] - in_features = scale_or_metadata["in_features"] - group_size = scale_or_metadata.get("group_size", 128) - - return _dequantize_awq( - qweight=qweight, - qzeros=qzeros, - scales=scales, - out_features=out_features, - in_features=in_features, - group_size=group_size, - ) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """AWQ quantization is done offline, so this should not be called.""" - raise NotImplementedError( - "AWQ quantization should be done offline. " - "Use set_offline_quantized_weight() to load pre-quantized weights." - ) - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W4A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - def linear_forward( self, x: torch.Tensor, @@ -311,199 +79,44 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using AWQ quantized weights (W4A16). 
- - Args: - x: Activation tensor [M, K] (bf16) - weight: Either bf16 weight [N, K] (fallback) or AWQ metadata dict - bias: Optional bias tensor [N] - quant_kind: Quantization kind (unused) - **kwargs: May include: - - awq_qweight: int8 packed int4 weights [N, (K+1)//2] - - awq_qzeros: int8 packed int4 zeros [num_groups, (K+1)//2] - - awq_scales: float32 scales [num_groups, K] or [num_groups] - - awq_group_size: int (default: 128) - - out_features: int (N) - - in_features: int (K) - """ - _ = quant_kind - - # Check if AWQ tensors are provided directly via kwargs - qweight = kwargs.pop("awq_qweight", None) - qzeros = kwargs.pop("awq_qzeros", None) - scales = kwargs.pop("awq_scales", None) - group_size = kwargs.pop("awq_group_size", 128) - out_features = kwargs.pop("out_features", None) - in_features = kwargs.pop("in_features", None) - - # If AWQ tensors are provided, use them - if qweight is not None and qzeros is not None and scales is not None: - if out_features is None or in_features is None: - # Infer from x shape - M, K = x.shape - if in_features is None: - in_features = K - if out_features is None: - # Infer from qweight shape - out_features = qweight.shape[0] - - M, K = x.shape - N = out_features - num_groups = (N + group_size - 1) // group_size - - # Handle scales shape: broadcast to [num_groups, in_features] if needed - if scales.shape == (num_groups,): - scales = scales.unsqueeze(-1).expand(num_groups, in_features) - elif scales.shape == (num_groups, 1): - scales = scales.expand(num_groups, in_features) - elif scales.shape != (num_groups, in_features): - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}), ({num_groups},), or ({num_groups}, 1)" - ) - - # Ensure all tensors are on the correct device - qweight = qweight.to(device=x.device) - qzeros = qzeros.to(device=x.device) - scales = scales.to(device=x.device, dtype=torch.float32) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and awq_w4a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, - ) - - # M-bucketing: reduce JIT compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad - - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K, num_groups, group_size) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - with set_autotune_inputs([x_for_kernel, qweight, qzeros, scales]): - kernel = awq_w4a16_gemm(M_bucket, N, K, num_groups, group_size) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - if config is not None: - kernel = awq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, **config) - else: - # Default config (backward compatible) - kernel = awq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[4] means output is the 5th 
parameter - output_full = kernel(x_for_kernel, qweight, qzeros, scales) - output = output_full[:M, :] if M_bucket != M else output_full + _ = quant_kind, weight + if ops is None: + raise RuntimeError( + "vLLM is required for AWQ W4A16 (missing `vllm._custom_ops`). " + "Please install/build vLLM with CUDA ops." + ) - # Add bias if present - if bias is not None: - output = output + bias + qweight = kwargs.get("awq_qweight", None) + qzeros = kwargs.get("awq_qzeros", None) + scales = kwargs.get("awq_scales", None) - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) + if qweight is None or qzeros is None or scales is None: + return F.linear(x, weight, bias) - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - warnings.warn( - f"TileLang AWQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Warn for unexpected errors - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"TileLang AWQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, - ) - else: - # TileLang not available, use Python fallback - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, - ) + # Infer pack_factor from packed shapes to avoid hard-coding 4-bit. + # AWQ: qweight [K, N/pack], scales [K/group, N] + if scales.ndim != 2 or scales.shape[1] <= 0: + raise RuntimeError(f"Invalid AWQ scales shape: {tuple(scales.shape)}") + if qweight.shape[1] <= 0 or int(scales.shape[1]) % int(qweight.shape[1]) != 0: + raise RuntimeError( + f"Invalid AWQ packed shapes: qweight.shape={tuple(qweight.shape)}, " + f"scales.shape={tuple(scales.shape)}" + ) + pack_factor = int(scales.shape[1]) // int(qweight.shape[1]) + # vLLM AWQ kernels expect FP16 activations. 
+ x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + qweight = qweight.to(device=x.device, dtype=torch.int32) + qzeros = qzeros.to(device=x.device, dtype=torch.int32) + scales = scales.to(device=x.device, dtype=torch.float16) - # Fallback: if weight is a regular bf16 tensor, use it directly - if isinstance(weight, torch.Tensor) and weight.dtype == torch.bfloat16: - return F.linear(x, weight, bias) + out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) - raise ValueError( - "AWQ strategy requires awq_qweight, awq_qzeros, and awq_scales to be provided " - "via kwargs or weight must be a bf16 tensor (fallback mode)" - ) + # Always use awq_gemm to avoid large temporary dequantized weight allocations. + out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor) - def _fallback_python_forward( - self, - x: torch.Tensor, - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - *, - out_features: int, - in_features: int, - group_size: int, - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - dequant_weight = _dequantize_awq( - qweight=qweight.to(device=x.device), - qzeros=qzeros.to(device=x.device), - scales=scales.to(device=x.device), - out_features=out_features, - in_features=in_features, - group_size=group_size, - ) - return F.linear(x, dequant_weight, bias) + if bias is not None: + out.add_(bias.to(dtype=out.dtype)) + out = out.reshape(out_shape) + return out.to(dtype=x.dtype) if out.dtype != x.dtype else out - def clear_cache(self) -> None: - """Clear cache (no-op, kept for compatibility).""" - pass diff --git a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py index 2e2cf1f..85048d8 100644 --- a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py @@ -1,12 +1,13 @@ """ -FP8 W8A16 Linear quantization strategy (FP8 weight + bf16 activation). +FP8 W8A16 Linear quantization strategy (FP8 weight + bf16 activation), TileLang-free. -Implementation notes: -- Weight quantization: per-output-channel FP8 quantization (fp8_e4m3 or fp8_e5m2) -- Activation: kept as bf16 (no activation quantization) -- Storage: FP8 weights use uint8 storage + view(fp8_dtype) pattern -- Scale management: per-channel weight scales (shape: [out_features]), dtype: float32 -- Forward path: Python fallback (dequantize FP8 weight → bf16, then F.linear) +vLLM-aligned implementation: +- Weight quantization: `vllm._custom_ops.scaled_fp8_quant` (FP8 weight + per-tensor scale). +- Forward: use vLLM's `Fp8LinearOp` (CUTLASS scaled_mm when available). 
+ +Note: +- vLLM 的 FP8 linear 核心路径以 e4m3 为主(由 vLLM 当前平台决定的 fp8 dtype)。 +- 为了避免“静默走慢路径”,这里不再使用 `F.linear` 的反量化 GEMM。 """ from __future__ import annotations @@ -14,40 +15,9 @@ from typing import Any, Optional import torch -import torch.nn.functional as F from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -from diffulex.utils.quantization.kv_cache_dtype import ( - parse_kv_cache_dtype, - _get_fp8_e4m3_dtype, - _get_fp8_e5m2_dtype, -) - -# Try to import TileLang kernels, fallback to None if not available -_TILELANG_AVAILABLE = False -_fp8_e4m3_w8a16_gemm = None -_fp8_e5m2_w8a16_gemm = None - -try: - from diffulex_kernel.python.linear_kernels import ( - fp8_e4m3_w8a16_gemm, - fp8_e5m2_w8a16_gemm, - ) - _TILELANG_AVAILABLE = True - _fp8_e4m3_w8a16_gemm = fp8_e4m3_w8a16_gemm - _fp8_e5m2_w8a16_gemm = fp8_e5m2_w8a16_gemm -except ImportError: - pass - -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f @register_linear_strategy(weight_dtype="fp8_e4m3", act_dtype="bf16") @@ -61,166 +31,76 @@ def _build_linear_fp8_e5m2_w8a16() -> LinearQuantizationStrategy: class LinearFP8W8A16Strategy(LinearQuantizationStrategy): - """FP8 W8A16 Linear strategy: FP8 weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: per-output-channel FP8 quantization (fp8_e4m3 or fp8_e5m2). - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Quantized weights are cached per weight tensor (by id) to avoid - re-quantizing on every forward pass. - """ - - def __init__(self, weight_dtype: str = "fp8_e4m3"): - """ - Initialize FP8 W8A16 strategy. 
- - Args: - weight_dtype: FP8 dtype string ("fp8_e4m3" or "fp8_e5m2") - """ + def __init__(self, weight_dtype: str = "fp8_e4m3") -> None: super().__init__() self.weight_dtype_str = weight_dtype - self.spec = parse_kv_cache_dtype(weight_dtype) - if not self.spec.is_fp8: - raise ValueError(f"Expected FP8 dtype, got {weight_dtype}") - - # Cache: weight_id -> (quantized_weight_uint8, scales_float32) - # Using id(weight) as key since the same Parameter object is reused across forwards + # Cache: id(weight) -> (q_fp8_KN [K,N], scale_fp32 [1]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} - + + try: + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( # type: ignore + Fp8LinearOp, + ) + except Exception as e: # pragma: no cover + raise RuntimeError("FP8 需要 vLLM(Fp8LinearOp / _custom_ops)。") from e + + # dynamic activation quantization to FP8 inside vLLM + self._fp8_linear = Fp8LinearOp(act_quant_static=False) + @property def name(self) -> str: return f"linear_fp8_{self.weight_dtype_str}_w8a16" - + @property def linear_weight_format(self) -> str: return self.weight_dtype_str - + @property def linear_act_format(self) -> str: return "bf16" - + def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # FP8 weights are stored as uint8 (1 byte per element) + # vLLM stores fp8 weights as float8 dtype tensor return torch.uint8, 1 - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to FP8 with per-channel (per-output) scales. - - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (quantized_tensor_uint8, scales_float32): quantized_tensor is uint8 (FP8 storage), - scales is [out_features] - """ - _ = kwargs - assert self.spec.fp8_view_dtype is not None - assert self.spec.fp8_min is not None and self.spec.fp8_max is not None - - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - eps = 1e-8 - fp8_max = float(self.spec.fp8_max) - - # Compute scales: abs_max / fp8_max - scales = (abs_max.clamp(min=eps) / fp8_max).to(torch.float32) # [out_features, 1] - - # Quantize: clamp(tensor / scale, fp8_min, fp8_max).to(fp8_dtype).view(uint8) - descale = 1.0 / scales # [out_features, 1] - quantized = (tensor.to(torch.float32) * descale).clamp( - min=float(self.spec.fp8_min), - max=float(self.spec.fp8_max) - ) - quantized_fp8 = quantized.to(self.spec.fp8_view_dtype) - quantized_uint8 = quantized_fp8.view(torch.uint8) - - scales_1d = scales.squeeze(-1) # [out_features] - - return quantized_uint8, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize FP8 tensor back to bf16 using per-channel scales. 
- - Args: - quantized: uint8 tensor [out_features, in_features] (FP8 storage) - scale_or_metadata: scales tensor [out_features] or dict with 'scales' - **kwargs: Additional arguments (unused for now) - - Returns: - Dequantized tensor in bf16 - """ + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: _ = kwargs - assert self.spec.fp8_view_dtype is not None - - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - else: - scales = scale_or_metadata - - if scales is None: - raise ValueError("scales required for dequantization") - - # View uint8 as FP8 dtype - fp8_tensor = quantized.view(self.spec.fp8_view_dtype).to(torch.float32) - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = fp8_tensor * scales.to(torch.float32) - return dequantized.to(torch.bfloat16) - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") + # per-tensor scale + return (1,) + + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + from vllm import _custom_ops as ops # type: ignore + from vllm.platforms import current_platform # type: ignore + + # vLLM: per-tensor scale, output dtype = current_platform.fp8_dtype() + q_fp8, scale = ops.scaled_fp8_quant(tensor.to(torch.float32).contiguous(), scale=None) + # Keep transpose-view for CUTLASS expectation (b.stride(0) == 1). + q_kn_fp8 = q_fp8.t() # [K,N] fp8 dtype, non-contiguous + scale = scale.to(torch.float32).reshape(1).contiguous() + return q_kn_fp8, {"scales": scale, "fp8_dtype": current_platform.fp8_dtype()} + def quantize_weight_for_kernel( self, weight: torch.Tensor, *, device: torch.device | None = None, - **kwargs: Any, + **_: Any, ) -> tuple[torch.Tensor, Any]: - """Quantize weight to FP8 with per-channel scales. 
- - Returns: - (quantized_weight_uint8, scales_float32): quantized_weight is uint8 [out, in], - scales is float32 [out] - """ - _ = kwargs + q_fp8, meta = self.quantize(weight) if device is not None: - weight = weight.to(device=device) - - quantized, scales = self.quantize(weight) - return quantized, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W8A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - + q_fp8 = q_fp8.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return q_fp8, meta["scales"] + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + _ = kwargs + raise RuntimeError("FP8 不提供 dequantize 路径(避免走慢的反量化 + F.linear)。") + def linear_forward( self, x: torch.Tensor, @@ -230,184 +110,33 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using quantized FP8 weights (W8A16). - - Uses Python reference implementation (dequant + F.linear). - Future: can integrate TileLang kernel if available. - """ _ = quant_kind - - # If caller provides a pre-quantized uint8 weight + scales (e.g., load-time quantized module), - # use them directly and DO NOT populate the lazy cache (to avoid double-storage). - quant_scales = kwargs.pop("quant_scales", None) - if weight.dtype == torch.uint8: - if quant_scales is None: - raise ValueError("weight is uint8 (FP8) but quant_scales is None; expected per-channel scales tensor") - quantized_weight = weight - scales = quant_scales - if scales.dtype != torch.float32: - scales = scales.to(dtype=torch.float32) - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - if scales.device != x.device: - scales = scales.to(device=x.device) - else: - # Lazy cache: use weight tensor id as key (only for bf16/fp16/fp32 weights) - weight_id = id(weight) - - # Check cache - if weight_id in self._weight_cache: - quantized_weight, scales = self._weight_cache[weight_id] - # Ensure cached tensors are on the correct device - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - scales = scales.to(device=x.device) - else: - # Quantize weight and cache it - quantized_weight, scales = self.quantize_weight_for_kernel(weight, device=x.device) - # Cache the quantized weight and scales - self._weight_cache[weight_id] = (quantized_weight, scales) - - # Speed-first option: cache dequantized bf16 weight for F.linear (cuBLAS) - # This trades extra GPU memory for throughput. 
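# ---------------------------------------------------------------------------
# Rough PyTorch-only sketch of per-tensor dynamic FP8 quantization (assumption:
# this mirrors the spirit of `ops.scaled_fp8_quant` with scale=None used by the
# new quantize() path; the real vLLM op is a fused CUDA kernel with extra
# handling, so treat this only as a reference for the math).
import torch

def per_tensor_fp8_quant(w: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    fp8 = torch.float8_e4m3fn
    fp8_max = torch.finfo(fp8).max                          # 448.0 for e4m3fn
    scale = w.float().abs().amax().clamp(min=1e-12) / fp8_max   # per-tensor fp32 scale
    q = (w.float() / scale).clamp(-fp8_max, fp8_max).to(fp8)
    return q, scale.reshape(1)                              # dequant: q.float() * scale

q, s = per_tensor_fp8_quant(torch.randn(128, 256))
assert q.dtype == torch.float8_e4m3fn and s.shape == (1,)
# ---------------------------------------------------------------------------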
- import os - if os.getenv("DIFFULEX_FP8_W8A16_PREFER_CUBLAS", "0") == "1": - deq_key = id(weight) if weight.dtype != torch.uint8 else id(quantized_weight) - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - # Dequantize: FP8[N,K] * scales[N] -> bf16[N,K] - deq_w = self.dequantize(quantized_weight, scales) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Try to use TileLang kernel if available - fp8_w8a16_gemm = None - if self.weight_dtype_str == "fp8_e4m3": - fp8_w8a16_gemm = _fp8_e4m3_w8a16_gemm - elif self.weight_dtype_str == "fp8_e5m2": - fp8_w8a16_gemm = _fp8_e5m2_w8a16_gemm - - if _TILELANG_AVAILABLE and fp8_w8a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - # Get shapes - M, K = x.shape - N, K_w = quantized_weight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Bucket M to reduce compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad + from vllm.platforms import current_platform # type: ignore - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - assert self.spec.fp8_view_dtype is not None - qweight_fp8 = quantized_weight.view(self.spec.fp8_view_dtype) - with set_autotune_inputs([x_for_kernel, qweight_fp8, scales]): - kernel = fp8_w8a16_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - assert self.spec.fp8_view_dtype is not None - qweight_fp8 = quantized_weight.view(self.spec.fp8_view_dtype) - if config is not None: - kernel = fp8_w8a16_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - kernel = fp8_w8a16_gemm(M_bucket, N, K, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[3] means output is the 4th parameter - assert self.spec.fp8_view_dtype is not None - qweight_fp8 = quantized_weight.view(self.spec.fp8_view_dtype) - output_full = kernel(x_for_kernel, qweight_fp8, scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if bias is not None: - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) - - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see 
logs for details)" - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Truncate very long error messages - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - - # Only warn for unexpected errors - if 'CUDA architecture not supported' not in error_msg and 'sm_' not in error_msg and 'Pipeline stages' not in error_msg: - warnings.warn( - f"TileLang kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward(x, quantized_weight, scales, bias) + quant_scales = kwargs.get("quant_scales", None) + if weight is not None and quant_scales is not None: + # Expected: weight is fp8 K×N tensor (transpose-view is fine). + q_kn = weight.to(device=x.device) + scales = quant_scales.to(device=x.device, dtype=torch.float32).reshape(1) else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - def _fallback_python_forward( - self, - x: torch.Tensor, - quantized_weight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - # Dequantize for reference implementation - dequantized_weight = self.dequantize(quantized_weight, scales) - - # Compute linear output - return F.linear(x, dequantized_weight, bias) - - def clear_cache(self) -> None: - """Clear the weight quantization cache. - - Useful for memory management or when weights are updated (e.g., fine-tuning). - """ - self._weight_cache.clear() - self._dequant_weight_cache.clear() + wid = id(weight) + cached = self._weight_cache.get(wid) + if cached is None or cached[0].device != x.device: + q_fp8, meta = self.quantize(weight) + q_fp8 = q_fp8.to(device=x.device) + scales = meta["scales"].to(device=x.device, dtype=torch.float32).reshape(1) + q_kn = q_fp8 + self._weight_cache[wid] = (q_fp8, scales) + else: + q_kn, scales = cached + + # vLLM Fp8LinearOp expects weight as [K,N] fp8 tensor and per-tensor scale. + return self._fp8_linear.apply( + input=x, + weight=q_kn, + weight_scale=scales, + out_dtype=x.dtype if x.dtype in (torch.bfloat16, torch.float16) else torch.bfloat16, + input_scale=None, + bias=bias, + ) diff --git a/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py b/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py index 73c7965..d7f48c6 100644 --- a/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_fp8_w8a8.py @@ -1,14 +1,9 @@ """ -FP8 W8A8 Linear quantization strategy (FP8 weight + FP8 activation). +FP8 W8A8 Linear quantization strategy (FP8 weight + FP8 activation), TileLang-free. -Implementation notes: -- Weight quantization: per-output-channel FP8 quantization (fp8_e4m3 or fp8_e5m2) -- Activation quantization: per-row FP8 quantization -- Storage: FP8 weights and activations use uint8 storage + view(fp8_dtype) pattern -- Scale management: - - Weight scales: per-channel [out_features], dtype: float16 - - Activation scales: per-row [M], dtype: float32 -- Forward path: Python fallback (dequantize both FP8 weight and activation → bf16, then F.linear) +vLLM-aligned implementation: +- Weight quantization: `vllm._custom_ops.scaled_fp8_quant` (per-tensor scale). +- Activation quantization + GEMM: vLLM `Fp8LinearOp` (CUTLASS scaled_mm when available). 
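A usage sketch of this weight-quantize-once / Fp8LinearOp flow, mirroring the call pattern used in this patch (the exact `Fp8LinearOp` constructor and `apply` signature vary across vLLM versions, so treat this as illustrative):

import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp

w = torch.randn(4096, 4096, device="cuda")                                  # [N, K] weight
q_fp8, w_scale = ops.scaled_fp8_quant(w.float().contiguous(), scale=None)   # per-tensor scale

fp8_linear = Fp8LinearOp(act_quant_static=False)                            # dynamic activation quant
x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
y = fp8_linear.apply(
    input=x,
    weight=q_fp8.t(),                        # [K, N] view, stride(0) == 1
    weight_scale=w_scale.float().reshape(1),
    out_dtype=torch.bfloat16,
    input_scale=None,
    bias=None,
)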
""" from __future__ import annotations @@ -16,75 +11,19 @@ from typing import Any, Optional import torch -import torch.nn.functional as F from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -from diffulex.utils.quantization.kv_cache_dtype import ( - parse_kv_cache_dtype, - _get_fp8_e4m3_dtype, - _get_fp8_e5m2_dtype, -) -# Try to import TileLang kernels, fallback to None if not available -_TILELANG_AVAILABLE = False -_fp8_e4m3_w8a8_gemm = None -_fp8_e5m2_w8a8_gemm = None -try: - from diffulex_kernel.python.linear_kernels import ( - fp8_e4m3_w8a8_gemm, - fp8_e5m2_w8a8_gemm, - ) - _TILELANG_AVAILABLE = True - _fp8_e4m3_w8a8_gemm = fp8_e4m3_w8a8_gemm - _fp8_e5m2_w8a8_gemm = fp8_e5m2_w8a8_gemm -except ImportError: - pass - -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - - -def _quantize_per_row_fp8( - x: torch.Tensor, - fp8_view_dtype: torch.dtype, - fp8_min: float, - fp8_max: float, -) -> tuple[torch.Tensor, torch.Tensor]: - """Per-row symmetric FP8 quantization. - - Args: - x: Input tensor [M, K] in bf16/fp16/fp32 - fp8_view_dtype: FP8 dtype (e.g., torch.float8_e4m3fn) - fp8_min: Minimum FP8 value - fp8_max: Maximum FP8 value - - Returns: - x_q: uint8 [M, K] (FP8 storage) - x_scales: float32 [M] where dequant is x_q.view(fp8_dtype).float() * x_scales[:, None] - """ - # x: [M, K] - abs_max = x.abs().amax(dim=-1, keepdim=False) # [M] - eps = 1e-8 - scales = (abs_max.clamp(min=eps) / fp8_max).to(torch.float32) # [M] - - # Quantize: clamp(x / scale, fp8_min, fp8_max).to(fp8_dtype).view(uint8) - descale = 1.0 / scales.unsqueeze(-1) # [M, 1] - quantized = (x.to(torch.float32) * descale).clamp( - min=fp8_min, - max=fp8_max - ) - quantized_fp8 = quantized.to(fp8_view_dtype) - quantized_uint8 = quantized_fp8.view(torch.uint8) - - return quantized_uint8, scales +def _require_fp8_linear_op(): + try: + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( # type: ignore + Fp8LinearOp, + ) + except Exception as e: # pragma: no cover + raise RuntimeError("FP8 需要 vLLM(Fp8LinearOp / _custom_ops)。") from e + return Fp8LinearOp @register_linear_strategy(weight_dtype="fp8_e4m3", act_dtype="fp8_e4m3") @@ -98,189 +37,65 @@ def _build_linear_fp8_e5m2_w8a8() -> LinearQuantizationStrategy: class LinearFP8W8A8Strategy(LinearQuantizationStrategy): - """FP8 W8A8 Linear strategy: FP8 weight + FP8 activation quantization, output bf16. - - Current implementation: Python reference using dequantized weights and activations + F.linear. - Weight quantization: per-output-channel FP8 quantization. - Activation quantization: per-row FP8 quantization. - """ - - def __init__(self, weight_dtype: str = "fp8_e4m3", act_dtype: str = "fp8_e4m3"): - """ - Initialize FP8 W8A8 strategy. 
- - Args: - weight_dtype: FP8 dtype string for weights ("fp8_e4m3" or "fp8_e5m2") - act_dtype: FP8 dtype string for activations ("fp8_e4m3" or "fp8_e5m2") - """ + def __init__(self, weight_dtype: str = "fp8_e4m3", act_dtype: str = "fp8_e4m3") -> None: super().__init__() self.weight_dtype_str = weight_dtype self.act_dtype_str = act_dtype - self.weight_spec = parse_kv_cache_dtype(weight_dtype) - self.act_spec = parse_kv_cache_dtype(act_dtype) - if not self.weight_spec.is_fp8 or not self.act_spec.is_fp8: - raise ValueError(f"Expected FP8 dtypes, got weight={weight_dtype}, act={act_dtype}") - - # Cache: weight_id -> (quantized_weight_uint8, scales_float16) + # Cache: id(weight) -> (q_fp8_KN [K,N], scale_fp32 [1]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} - + Fp8LinearOp = _require_fp8_linear_op() + self._fp8_linear = Fp8LinearOp(act_quant_static=False) + @property def name(self) -> str: return f"linear_fp8_{self.weight_dtype_str}_w8a8" - + @property def linear_weight_format(self) -> str: return self.weight_dtype_str - + @property def linear_act_format(self) -> str: return self.act_dtype_str - + def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # FP8 weights are stored as uint8 (1 byte per element) return torch.uint8, 1 - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - - def clear_cache(self) -> None: - self._weight_cache.clear() - self._dequant_weight_cache.clear() - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to FP8 with per-channel (per-output) scales. 
- - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (quantized_tensor_uint8, scales_float16): quantized_tensor is uint8 (FP8 storage), - scales is float16 [out_features] - """ + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: _ = kwargs - assert self.weight_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_min is not None and self.weight_spec.fp8_max is not None - - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - eps = 1e-8 - fp8_max = float(self.weight_spec.fp8_max) - - # Compute scales: abs_max / fp8_max - # Use float16 for weight scales (W8A8 paths are sensitive to scale precision) - scales = (abs_max.clamp(min=eps) / fp8_max).to(torch.float16) # [out_features, 1] - - # Quantize: clamp(tensor / scale, fp8_min, fp8_max).to(fp8_dtype).view(uint8) - descale = 1.0 / scales # [out_features, 1] - quantized = (tensor.to(torch.float32) * descale).clamp( - min=float(self.weight_spec.fp8_min), - max=float(self.weight_spec.fp8_max) - ) - quantized_fp8 = quantized.to(self.weight_spec.fp8_view_dtype) - quantized_uint8 = quantized_fp8.view(torch.uint8) - - scales_1d = scales.squeeze(-1) # [out_features] - - return quantized_uint8, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize FP8 tensor back to bf16 using per-channel scales. - - Args: - quantized: uint8 tensor [out_features, in_features] (FP8 storage) - scale_or_metadata: scales tensor [out_features] or dict with 'scales' - **kwargs: Additional arguments (unused for now) - - Returns: - Dequantized tensor in bf16 - """ + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") + return (1,) + + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - assert self.weight_spec.fp8_view_dtype is not None - - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - else: - scales = scale_or_metadata - - if scales is None: - raise ValueError("scales required for dequantization") - - # View uint8 as FP8 dtype - fp8_tensor = quantized.view(self.weight_spec.fp8_view_dtype).to(torch.float32) - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = fp8_tensor * scales.to(torch.float32) - return dequantized.to(torch.bfloat16) - + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + from vllm import _custom_ops as ops # type: ignore + from vllm.platforms import current_platform # type: ignore + + q_fp8, scale = ops.scaled_fp8_quant(tensor.to(torch.float32).contiguous(), scale=None) + q_kn_fp8 = q_fp8.t() # [K,N], stride(0)==1 + scale = scale.to(torch.float32).reshape(1).contiguous() + return q_kn_fp8, {"scales": scale, "fp8_dtype": current_platform.fp8_dtype()} + def quantize_weight_for_kernel( self, weight: torch.Tensor, *, device: torch.device | None = None, - **kwargs: Any, + **_: Any, ) -> tuple[torch.Tensor, Any]: - """Quantize weight to FP8 with per-channel scales. 
- - Returns: - (quantized_weight_uint8, scales_float16): quantized_weight is uint8 [out, in], - scales is float16 [out] - """ - _ = kwargs + q_fp8, meta = self.quantize(weight) if device is not None: - weight = weight.to(device=device) - - quantized, scales = self.quantize(weight) - return quantized, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """Quantize activation to FP8 with per-row scales. - - Returns: - (quantized_act_uint8, scales_float32): quantized_act is uint8 [M, K], - scales is float32 [M] - """ - if device is not None: - x = x.to(device=device) - - assert self.act_spec.fp8_view_dtype is not None - assert self.act_spec.fp8_min is not None and self.act_spec.fp8_max is not None - - # Ensure input is in a compatible dtype - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - - quantized, scales = _quantize_per_row_fp8( - x, - self.act_spec.fp8_view_dtype, - float(self.act_spec.fp8_min), - float(self.act_spec.fp8_max), - ) - return quantized, scales - + q_fp8 = q_fp8.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return q_fp8, meta["scales"] + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + _ = kwargs + raise RuntimeError("FP8 不提供 dequantize 路径(避免走慢的反量化 + F.linear)。") + def linear_forward( self, x: torch.Tensor, @@ -290,218 +105,25 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using quantized FP8 weights and activations (W8A8). - - Uses Python reference implementation (dequantize both + F.linear). - Future: can integrate TileLang kernel if available. 
- """ _ = quant_kind - - quant_scales = kwargs.pop("quant_scales", None) - - # Resolve / cache quantized weight + scales - if weight.dtype == torch.uint8: - if quant_scales is None: - raise ValueError("weight is uint8 (FP8) but quant_scales is None; expected per-channel scales tensor") - qweight = weight if weight.device == x.device else weight.to(device=x.device) - w_scales = quant_scales - # Prefer float16 scales for quality - if w_scales.dtype != torch.float16: - w_scales = w_scales.to(dtype=torch.float16) - if w_scales.device != x.device: - w_scales = w_scales.to(device=x.device) - weight_id = id(weight) + wid = id(weight) + cached = self._weight_cache.get(wid) + if cached is None or cached[0].device != x.device: + q_fp8, meta = self.quantize(weight) + q_fp8 = q_fp8.to(device=x.device) + w_scale = meta["scales"].to(device=x.device, dtype=torch.float32).reshape(1) + self._weight_cache[wid] = (q_fp8, w_scale) else: - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None: - qweight, w_scales = self.quantize_weight_for_kernel(weight, device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) - else: - qweight, w_scales = cached - if qweight.device != x.device: - qweight = qweight.to(device=x.device) - w_scales = w_scales.to(device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) - - # Optional: use cuBLAS BF16 (dequant once) - import os - if os.getenv("DIFFULEX_FP8_W8A8_PREFER_CUBLAS", "0") == "1": - deq_key = weight_id - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - deq_w = self.dequantize(qweight, w_scales) - self._dequant_weight_cache[deq_key] = deq_w - # Also dequantize activation - x_q_temp, x_scales_temp = self.quantize_act_for_kernel(x, device=x.device) - x_deq = self._dequantize_act(x_q_temp, x_scales_temp) - return F.linear(x_deq, deq_w, bias) - - # Quantize activation per-row - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - x_q, x_scales = self.quantize_act_for_kernel(x, device=x.device) - - # Try to use TileLang kernel if available - # For W8A8, weight_dtype and act_dtype should match (both e4m3 or both e5m2) - fp8_w8a8_gemm = None - if self.weight_dtype_str == "fp8_e4m3" and self.act_dtype_str == "fp8_e4m3": - fp8_w8a8_gemm = _fp8_e4m3_w8a8_gemm - elif self.weight_dtype_str == "fp8_e5m2" and self.act_dtype_str == "fp8_e5m2": - fp8_w8a8_gemm = _fp8_e5m2_w8a8_gemm - - if _TILELANG_AVAILABLE and fp8_w8a8_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x_q, x_scales, qweight, w_scales, bias) - - # Get shapes - M, K = x_q.shape - N, K_w = qweight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Bucket M to reduce compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 + q_fp8, w_scale = cached - x_q_for_kernel = x_q - if M_bucket != M: - x_q_pad = torch.zeros((M_bucket, K), device=x_q.device, dtype=x_q.dtype) - x_q_pad[:M, :] = x_q - x_q_for_kernel = x_q_pad - # Pad scales as well - x_scales_pad = torch.zeros((M_bucket,), device=x_scales.device, dtype=x_scales.dtype) - x_scales_pad[:M] = x_scales - x_scales = x_scales_pad + q_kn = q_fp8 - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: 
- # Warmup phase: run autotune with real inputs - try: - assert self.act_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_view_dtype is not None - x_fp8 = x_q_for_kernel.view(self.act_spec.fp8_view_dtype) - w_fp8 = qweight.view(self.weight_spec.fp8_view_dtype) - with set_autotune_inputs([x_fp8, w_fp8, x_scales, w_scales]): - kernel = fp8_w8a8_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - assert self.act_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_view_dtype is not None - x_fp8 = x_q_for_kernel.view(self.act_spec.fp8_view_dtype) - w_fp8 = qweight.view(self.weight_spec.fp8_view_dtype) - if config is not None: - kernel = fp8_w8a8_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - kernel = fp8_w8a8_gemm(M_bucket, N, K, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[4] means output is the 5th parameter - # Inputs: A/B are fp8 tensors (viewed from uint8 storage), scales are float32/float16. - assert self.act_spec.fp8_view_dtype is not None - assert self.weight_spec.fp8_view_dtype is not None - x_fp8 = x_q_for_kernel.view(self.act_spec.fp8_view_dtype) - w_fp8 = qweight.view(self.weight_spec.fp8_view_dtype) - output_full = kernel(x_fp8, w_fp8, x_scales, w_scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if bias is not None: - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) - - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Truncate very long error messages - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." 
- - # Only warn for unexpected errors - if 'CUDA architecture not supported' not in error_msg and 'sm_' not in error_msg and 'Pipeline stages' not in error_msg: - warnings.warn( - f"TileLang kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward(x_q, x_scales, qweight, w_scales, bias) - else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x_q, x_scales, qweight, w_scales, bias) - - def _fallback_python_forward( - self, - x_q: torch.Tensor, - x_scales: torch.Tensor, - qweight: torch.Tensor, - w_scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - """Fallback Python implementation: dequantize both + F.linear.""" - # Dequantize both weight and activation - deq_w = self.dequantize(qweight, w_scales) - deq_x = self._dequantize_act(x_q, x_scales) - - # Compute linear output - return F.linear(deq_x, deq_w, bias) - - def _dequantize_act( - self, - quantized: torch.Tensor, - scales: torch.Tensor, - ) -> torch.Tensor: - """Dequantize FP8 activation tensor. - - Args: - quantized: uint8 tensor [M, K] (FP8 storage) - scales: float32 tensor [M] (per-row scales) - - Returns: - Dequantized tensor in bf16 [M, K] - """ - assert self.act_spec.fp8_view_dtype is not None - - # View uint8 as FP8 dtype - fp8_tensor = quantized.view(self.act_spec.fp8_view_dtype).to(torch.float32) - - # Reshape scales to broadcast: [M] -> [M, 1] - scales_view = scales.to(torch.float32).unsqueeze(-1) # [M, 1] - - # Dequantize: value * scale - dequantized = fp8_tensor * scales_view - return dequantized.to(torch.bfloat16) + return self._fp8_linear.apply( + input=x, + weight=q_kn, + weight_scale=w_scale, + out_dtype=x.dtype if x.dtype in (torch.bfloat16, torch.float16) else torch.bfloat16, + input_scale=None, + bias=bias, + ) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py new file mode 100644 index 0000000..da81d3e --- /dev/null +++ b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py @@ -0,0 +1,156 @@ +""" +GPTQ Marlin (W4/W8, A16) Linear strategy using vLLM Marlin CUDA kernels. + +- Input activations: bf16 (cast to fp16 for vLLM marlin kernel) +- Weights: offline GPTQ vLLM standard format (qweight/qzeros/scales/g_idx) +- One-time repack/permutation is performed by Diffulex `LinearBase` and passed in via kwargs: + - gptq_marlin_qweight / gptq_marlin_scales / gptq_marlin_zp + - gptq_marlin_g_idx / gptq_marlin_g_idx_sort_indices + - gptq_marlin_workspace + +This strategy intentionally does NOT fall back to F.linear silently: if marlin tensors +are missing, it raises to avoid accidentally benchmarking a slow path. 
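As a small sketch of the bit-width mapping and `is_k_full` decision used below (imports mirror this file's own; exact module paths and signatures depend on the vLLM version):

from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.marlin_utils import marlin_is_k_full

def pick_marlin_wtype(weight_bits: int):
    # GPTQ stores biased unsigned values: 4-bit -> uint4b8 (bias 8), 8-bit -> uint8b128 (bias 128).
    if weight_bits == 4:
        return scalar_types.uint4b8
    if weight_bits == 8:
        return scalar_types.uint8b128
    raise ValueError(f"unsupported weight_bits={weight_bits}")

# K is "full" unless the shard is row-parallel and the checkpoint uses activation reordering (g_idx).
is_k_full = marlin_is_k_full(False, True)   # positional (has_g_idx, is_row_parallel), as in this strategy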
+""" + +from __future__ import annotations + +from typing import Any, Optional + +import torch +import torch.nn.functional as F + +from diffulex.utils.quantization.registry import register_linear_strategy +from diffulex.utils.quantization.strategy import LinearQuantizationStrategy + +try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + apply_gptq_marlin_linear, + marlin_is_k_full, + marlin_make_empty_g_idx, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore +except Exception: # pragma: no cover + apply_gptq_marlin_linear = None # type: ignore + marlin_is_k_full = None # type: ignore + marlin_make_empty_g_idx = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + +@register_linear_strategy(weight_dtype="gptq_marlin", act_dtype="bf16") +def _build_linear_gptq_marlin_w4a16() -> LinearQuantizationStrategy: + return LinearGPTQMarlinW4A16Strategy() + + +class LinearGPTQMarlinW4A16Strategy(LinearQuantizationStrategy): + @property + def name(self) -> str: + return "linear_gptq_marlin_w4a16" + + @property + def linear_weight_format(self) -> str: + return "gptq_marlin" + + @property + def linear_act_format(self) -> str: + return "bf16" + + def get_storage_dtype(self) -> tuple[torch.dtype, int]: + return torch.int32, 4 + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: + # Same as GPTQ: [K/group, N] + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized + raise NotImplementedError("GPTQ Marlin 不提供 Python dequantize;请使用 vLLM Marlin CUDA kernel。") + + def linear_forward( + self, + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + *, + quant_kind: str, + **kwargs: Any, + ) -> torch.Tensor: + _ = quant_kind, weight + if apply_gptq_marlin_linear is None or scalar_types is None: + raise RuntimeError("gptq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") + + qweight = kwargs.get("gptq_marlin_qweight", None) + scales = kwargs.get("gptq_marlin_scales", None) + zp = kwargs.get("gptq_marlin_zp", None) + g_idx = kwargs.get("gptq_marlin_g_idx", None) + g_idx_sort_indices = kwargs.get("gptq_marlin_g_idx_sort_indices", None) + workspace = kwargs.get("gptq_marlin_workspace", None) + in_features = int(kwargs.get("in_features", 0)) + out_features = int(kwargs.get("out_features", 0)) + weight_bits = int(kwargs.get("gptq_weight_bits", 0)) + + if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: + raise RuntimeError("gptq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") + + if weight_bits == 4: + wtype = scalar_types.uint4b8 + elif weight_bits == 8: + wtype = scalar_types.uint8b128 + else: + raise RuntimeError(f"gptq_marlin: unsupported weight_bits={weight_bits} (expected 4 or 
8)") + + # vLLM marlin kernels expect FP16 activations. + x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + + # g_idx can be empty (desc_act=False). Ensure correct dtype/device. + if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + g_idx_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + else: + g_idx_t = g_idx.to(device=x.device, dtype=torch.int32) + if g_idx_sort_indices is None or (isinstance(g_idx_sort_indices, torch.Tensor) and g_idx_sort_indices.numel() == 0): + g_idx_sort_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + else: + g_idx_sort_t = g_idx_sort_indices.to(device=x.device, dtype=torch.int32) + + # Determine whether K is full (needed by marlin kernel). Row-parallel layers set tp_dim=1 in Diffulex. + row_parallel = bool(kwargs.get("tp_dim", None) == 1) + has_g_idx = bool(g_idx_t.numel() > 0) + if marlin_is_k_full is None: + is_k_full = True + else: + is_k_full = marlin_is_k_full(has_g_idx, row_parallel) + + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + out = apply_gptq_marlin_linear( + input=x_in, + weight=qweight, + weight_scale=scales, + weight_zp=zp, + g_idx=g_idx_t, + g_idx_sort_indices=g_idx_sort_t, + workspace=workspace, + wtype=wtype, + output_size_per_partition=out_features, + input_size_per_partition=in_features, + is_k_full=is_k_full, + bias=marlin_bias, + input_dtype=None, + ) + return out.to(dtype=x.dtype) if out.dtype != x.dtype else out + diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index c86c532..8fc67a5 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -1,11 +1,15 @@ """ -GPTQ W4A16 Linear quantization strategy (GPTQ weight + bf16 activation). +GPTQ W4A16 Linear quantization strategy (vLLM standard format). -Implementation notes: -- Weight quantization: GPTQ format with groupwise quantization -- Activation: kept as bf16 (no activation quantization) -- Storage: GPTQ uses packed int4 weights (qweight), int4 zeros (qzeros), and per-group scales -- Forward path: Dequantize GPTQ weights to bf16, then use F.linear +- Weight format: vLLM GPTQ (packed int32 qweight/qzeros + fp16 scales) +- Activation: bf16 (no activation quantization) +- Forward: vLLM custom op `gptq_gemm` + +Design notes: +- Diffulex follows vLLM's fast path: run `gptq_shuffle` once (handled by + `LinearBase._maybe_prepare_offline_gptq`) and then call `gptq_gemm` with + `use_exllama=True`. +- No TileLang dependency. 
""" from __future__ import annotations @@ -18,178 +22,10 @@ from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -_TILELANG_AVAILABLE = False -try: - from diffulex_kernel.python.linear_kernels import gptq_w4a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - gptq_w4a16_gemm = None - try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - - -def _unpack_gptq_int4( - packed: torch.Tensor, - *, - out_features: int, - in_features: int, -) -> torch.Tensor: - """Unpack GPTQ packed int4 weights into int8 values. - - GPTQ packs 2 int4 values per int8 byte: - - Lower 4 bits: even columns - - Upper 4 bits: odd columns - - Args: - packed: int8 tensor [out_features, (in_features + 1) // 2] - out_features: Original output features - in_features: Original input features - - Returns: - unpacked: int8 tensor [out_features, in_features] with values in [-8, 7] - """ - if packed.dtype != torch.int8: - raise TypeError(f"packed weight must be int8, got {packed.dtype}") - - out_features_actual, packed_in = packed.shape - expected_packed_in = (in_features + 1) // 2 - if packed_in != expected_packed_in: - raise ValueError( - f"Packed input dimension mismatch: got {packed_in}, " - f"expected {expected_packed_in} for in_features={in_features}" - ) - if out_features_actual != out_features: - raise ValueError( - f"Output dimension mismatch: got {out_features_actual}, " - f"expected {out_features}" - ) - - # Interpret bytes as uint8 for bit manipulation - p_u8 = packed.view(torch.uint8) - # Extract lower and upper 4 bits - low_u8 = (p_u8 & 0x0F) # [0..15] - high_u8 = ((p_u8 >> 4) & 0x0F) # [0..15] - - # Convert unsigned nibble [0..15] to signed int4 [-8..7] - # Packing: int4 [-8, 7] + 8 -> uint8 [0, 15] - # Unpacking: uint8 [0, 15] - 8 -> int4 [-8, 7] - low_s = low_u8.to(torch.int16) - 8 - high_s = high_u8.to(torch.int16) - 8 - - # Interleave low/high along in_features - unpacked = torch.empty((out_features, packed_in * 2), device=packed.device, dtype=torch.int16) - unpacked[:, 0::2] = low_s - unpacked[:, 1::2] = high_s - unpacked = unpacked[:, :in_features].to(torch.int8) - return unpacked - - -def _dequantize_gptq( - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - *, - out_features: int, - in_features: int, - group_size: int = 128, - g_idx: Optional[torch.Tensor] = None, -) -> torch.Tensor: - """Dequantize GPTQ weights to bf16. 
- - GPTQ uses groupwise quantization: - - Weight is quantized per group (group_size consecutive elements) - - Each group has its own scale and zero point - - g_idx (optional) maps each weight element to its group - - Args: - qweight: int8 tensor [out_features, (in_features + 1) // 2] packed int4 - qzeros: int8 tensor [(out_features + group_size - 1) // group_size, (in_features + 1) // 2] packed int4 - scales: float32 tensor [(out_features + group_size - 1) // group_size, in_features] - out_features: Output features - in_features: Input features - group_size: Group size for quantization (default: 128) - g_idx: Optional int32 tensor [in_features] mapping each weight to its group - - Returns: - dequantized: bf16 tensor [out_features, in_features] - """ - device = qweight.device - - # Unpack qweight to int8 [out_features, in_features] - w_int8 = _unpack_gptq_int4(qweight, out_features=out_features, in_features=in_features) - - # Unpack qzeros to int8 [num_groups, in_features] - num_groups = (out_features + group_size - 1) // group_size - if qzeros.shape[0] != num_groups: - raise ValueError( - f"qzeros shape mismatch: got {qzeros.shape[0]} groups, " - f"expected {num_groups} for out_features={out_features}, group_size={group_size}" - ) - zeros_int8 = _unpack_gptq_int4(qzeros, out_features=num_groups, in_features=in_features) - - # Ensure scales have correct shape [num_groups, in_features] - if scales.shape != (num_groups, in_features): - # If scales is [num_groups] or [num_groups, 1], broadcast to [num_groups, in_features] - if scales.shape == (num_groups,) or scales.shape == (num_groups, 1): - scales = scales.unsqueeze(-1).expand(num_groups, in_features) - else: - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}) or ({num_groups},) or ({num_groups}, 1)" - ) - - # Convert to float32 for dequantization - w_fp32 = w_int8.to(torch.float32) - zeros_int8_fp32 = zeros_int8.to(torch.float32) # Quantized zeros (int8) - scales_fp32 = scales.to(torch.float32) - - # Dequantize zeros: zero = zero_quantized * scale - # zeros_int8 was quantized as: zero_quantized = round(zero / scale) - # So to recover: zero = zero_quantized * scale - zeros_fp32 = zeros_int8_fp32 * scales_fp32 # [num_groups, in_features] - - # Dequantize: (weight - zero) * scale - # w_int8 is [out_features, in_features] - # zeros_int8 is [num_groups, in_features] - # scales_fp32 is [num_groups, in_features] - - # For each output channel, determine which group it belongs to - if g_idx is not None: - # g_idx maps each output channel to its group - if g_idx.shape != (out_features,): - raise ValueError( - f"g_idx shape mismatch: got {g_idx.shape}, expected ({out_features},)" - ) - # g_idx: [out_features] -> group_id for each output channel - group_ids = g_idx.to(torch.int64) # [out_features] - # Clamp group_ids to valid range [0, num_groups-1] - group_ids = torch.clamp(group_ids, 0, num_groups - 1) - # Gather zeros and scales for each output channel - # zeros_fp32: [num_groups, in_features], group_ids: [out_features] - # We need to index along dimension 0 for each output channel - zeros_for_channel = zeros_fp32[group_ids] # [out_features, in_features] - scales_for_channel = scales_fp32[group_ids] # [out_features, in_features] - else: - # Without g_idx, assume sequential grouping: group_id = out_idx // group_size - group_ids = torch.arange(out_features, device=device) // group_size # [out_features] - # Clamp group_ids to valid range - group_ids = torch.clamp(group_ids, 0, num_groups - 1) 
- zeros_for_channel = zeros_fp32[group_ids] # [out_features, in_features] - scales_for_channel = scales_fp32[group_ids] # [out_features, in_features] - - # Dequantize: quantized * scale + zero - # Quantization formula: quantized = round((weight - zero) / scale) - # Dequantization formula: weight = quantized * scale + zero - dequantized = w_fp32 * scales_for_channel + zeros_for_channel - return dequantized.to(torch.bfloat16) + from vllm import _custom_ops as ops # type: ignore +except Exception: # pragma: no cover + ops = None # type: ignore @register_linear_strategy(weight_dtype="gptq", act_dtype="bf16") @@ -198,21 +34,6 @@ def _build_linear_gptq_w4a16() -> LinearQuantizationStrategy: class LinearGPTQW4A16Strategy(LinearQuantizationStrategy): - """GPTQ W4A16 Linear strategy: GPTQ weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: GPTQ format with groupwise quantization (typically group_size=128). - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Dequantized weights are cached to avoid re-dequantizing on every forward pass. - """ - - def __init__(self): - """Initialize strategy (no cache needed when using kernel).""" - super().__init__() - # TileLang autotune config cache: (device, M_bucket, N, K, num_groups, group_size) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int, int, int], dict] = {} - @property def name(self) -> str: return "linear_gptq_w4a16" @@ -226,101 +47,33 @@ def linear_act_format(self) -> str: return "bf16" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # GPTQ weights are stored as packed int8 (2 int4 per byte) - return torch.int8, 1 + # vLLM GPTQ stores packed weights in int32. + return torch.int32, 4 def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for GPTQ groupwise quantization. - - For [out_features, in_features] weight with group_size groups: - - scales shape is [(out_features + group_size - 1) // group_size, in_features] - """ - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - out_features, in_features = original_shape[0], original_shape[1] - group_size = kwargs.get("group_size", 128) - num_groups = (out_features + group_size - 1) // group_size - return (num_groups, in_features) - - def quantize(self, tensor: torch.Tensor, **kwargs): - """GPTQ quantization is typically done offline, so this is a placeholder.""" + # vLLM GPTQ scales: [K/group, N], where Linear weight is (N, K). + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight shape, got {original_shape}") + out_features, in_features = original_shape + group_size = int(kwargs.get("group_size", 128)) + group_size = in_features if group_size == -1 else group_size + if group_size <= 0 or in_features % group_size != 0: + raise ValueError(f"Invalid group_size={group_size} for in_features={in_features}") + num_groups = in_features // group_size + return (num_groups, out_features) + + def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + # Offline GPTQ is handled by `diffulex.utils.quantization.quantize_model`. + return tensor, {} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: + if quantized.is_floating_point(): + return quantized raise NotImplementedError( - "GPTQ quantization should be done offline using GPTQ tools. 
" - "This strategy only supports loading pre-quantized weights." + "GPTQ dequantize is not implemented in Diffulex. " + "Use vLLM kernels via linear_forward." ) - def dequantize( - self, - quantized: torch.Tensor, - scale_or_metadata: Any, - **kwargs - ) -> torch.Tensor: - """Dequantize GPTQ weights. - - Args: - quantized: Not used (kept for interface compatibility) - scale_or_metadata: Dict with keys: - - 'qweight': int8 packed int4 weights - - 'qzeros': int8 packed int4 zeros - - 'scales': float32 per-group scales - - 'out_features': int - - 'in_features': int - - 'group_size': int (default: 128) - - 'g_idx': Optional int32 group indices - **kwargs: Additional arguments - - Returns: - Dequantized tensor in bf16 - """ - if not isinstance(scale_or_metadata, dict): - raise ValueError( - "GPTQ dequantize requires dict metadata with keys: " - "qweight, qzeros, scales, out_features, in_features, group_size (optional), g_idx (optional)" - ) - - qweight = scale_or_metadata["qweight"] - qzeros = scale_or_metadata["qzeros"] - scales = scale_or_metadata["scales"] - out_features = scale_or_metadata["out_features"] - in_features = scale_or_metadata["in_features"] - group_size = scale_or_metadata.get("group_size", 128) - g_idx = scale_or_metadata.get("g_idx", None) - - return _dequantize_gptq( - qweight=qweight, - qzeros=qzeros, - scales=scales, - out_features=out_features, - in_features=in_features, - group_size=group_size, - g_idx=g_idx, - ) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """GPTQ quantization is done offline, so this should not be called.""" - raise NotImplementedError( - "GPTQ quantization should be done offline. " - "Use set_offline_quantized_weight() to load pre-quantized weights." - ) - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W4A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - def linear_forward( self, x: torch.Tensor, @@ -330,211 +83,65 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using GPTQ quantized weights (W4A16). 
- - Args: - x: Activation tensor [M, K] (bf16) - weight: Either bf16 weight [N, K] (fallback) or GPTQ metadata dict - bias: Optional bias tensor [N] - quant_kind: Quantization kind (unused) - **kwargs: May include: - - gptq_qweight: int8 packed int4 weights [N, (K+1)//2] - - gptq_qzeros: int8 packed int4 zeros [num_groups, (K+1)//2] - - gptq_scales: float32 scales [num_groups, K] - - gptq_group_size: int (default: 128) - - gptq_g_idx: Optional int32 group indices [N] - - out_features: int (N) - - in_features: int (K) - """ - _ = quant_kind - - # Check if GPTQ tensors are provided directly via kwargs - qweight = kwargs.pop("gptq_qweight", None) - qzeros = kwargs.pop("gptq_qzeros", None) - scales = kwargs.pop("gptq_scales", None) - group_size = kwargs.pop("gptq_group_size", 128) - g_idx = kwargs.pop("gptq_g_idx", None) - out_features = kwargs.pop("out_features", None) - in_features = kwargs.pop("in_features", None) - - # If GPTQ tensors are provided, use them - if qweight is not None and qzeros is not None and scales is not None: - if out_features is None or in_features is None: - # Infer from x shape - M, K = x.shape - if in_features is None: - in_features = K - if out_features is None: - # Infer from qweight shape - out_features = qweight.shape[0] - - M, K = x.shape - N = out_features - num_groups = (N + group_size - 1) // group_size - - # Handle scales shape: broadcast to [num_groups, in_features] if needed - if scales.shape == (num_groups,): - scales = scales.unsqueeze(-1).expand(num_groups, in_features) - elif scales.shape == (num_groups, 1): - scales = scales.expand(num_groups, in_features) - elif scales.shape != (num_groups, in_features): - raise ValueError( - f"scales shape mismatch: got {scales.shape}, " - f"expected ({num_groups}, {in_features}), ({num_groups},), or ({num_groups}, 1)" - ) - - # Handle GIdx: if None, create sequential indices - device = qweight.device - if g_idx is None: - g_idx = torch.arange(N, device=device, dtype=torch.int32) // group_size - else: - g_idx = g_idx.to(device=device, dtype=torch.int32) - - # Ensure all tensors are on the correct device - qweight = qweight.to(device=x.device) - qzeros = qzeros.to(device=x.device) - scales = scales.to(device=x.device, dtype=torch.float32) - g_idx = g_idx.to(device=x.device) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and gptq_w4a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, g_idx=g_idx, - ) - - # M-bucketing: reduce JIT compilation churn - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad - - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K, num_groups, group_size) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - with set_autotune_inputs([x_for_kernel, qweight, qzeros, scales, g_idx]): - kernel = gptq_w4a16_gemm(M_bucket, N, K, num_groups, group_size) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use 
cached config or default parameters - if config is not None: - kernel = gptq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, **config) - else: - # Default config (backward compatible) - kernel = gptq_w4a16_gemm(M_bucket, N, K, num_groups, group_size, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[5] means output is the 6th parameter - output_full = kernel(x_for_kernel, qweight, qzeros, scales, g_idx) - output = output_full[:M, :] if M_bucket != M else output_full + _ = quant_kind, weight + if ops is None: + raise RuntimeError( + "vLLM is required for GPTQ W4A16 (missing `vllm._custom_ops`). " + "Please install/build vLLM with CUDA ops." + ) - # Add bias if present - if bias is not None: - output = output + bias + qweight = kwargs.get("gptq_qweight", None) + qzeros = kwargs.get("gptq_qzeros", None) + scales = kwargs.get("gptq_scales", None) + g_idx = kwargs.get("gptq_g_idx", None) - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) + if qweight is None or qzeros is None or scales is None: + return F.linear(x, weight, bias) - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - warnings.warn( - f"TileLang GPTQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Warn for unexpected errors - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"TileLang GPTQ kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, g_idx=g_idx, - ) - else: - # TileLang not available, use Python fallback - return self._fallback_python_forward( - x, qweight, qzeros, scales, bias, - out_features=N, in_features=in_features, - group_size=group_size, g_idx=g_idx, - ) + use_v2_format = bool(kwargs.get("gptq_use_v2_format", False)) - # Fallback: if weight is a regular bf16 tensor, use it directly - if isinstance(weight, torch.Tensor) and weight.dtype == torch.bfloat16: - return F.linear(x, weight, bias) + # Infer weight_bits from packed shapes to support GPTQ W2/W4/W8. + # qzeros: [K/group, N/pack_factor] and qweight: [K/pack_factor, N] + if qzeros.shape[1] <= 0 or qweight.shape[1] % int(qzeros.shape[1]) != 0: + raise RuntimeError( + f"Invalid GPTQ packed shapes: qweight.shape={tuple(qweight.shape)}, " + f"qzeros.shape={tuple(qzeros.shape)}" + ) + pack_factor = int(qweight.shape[1]) // int(qzeros.shape[1]) + if 32 % pack_factor != 0: + raise RuntimeError( + f"Unsupported GPTQ pack_factor={pack_factor} (requires 32%pack_factor==0). 
" + f"qweight.shape={tuple(qweight.shape)}, qzeros.shape={tuple(qzeros.shape)}" + ) + weight_bits = 32 // pack_factor - raise ValueError( - "GPTQ strategy requires gptq_qweight, gptq_qzeros, and gptq_scales to be provided " - "via kwargs or weight must be a bf16 tensor (fallback mode)" - ) + # vLLM GPTQ kernels expect FP16 activations. + x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + qweight = qweight.to(device=x.device, dtype=torch.int32) + qzeros = qzeros.to(device=x.device, dtype=torch.int32) + scales = scales.to(device=x.device, dtype=torch.float16) - def _fallback_python_forward( - self, - x: torch.Tensor, - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - *, - out_features: int, - in_features: int, - group_size: int, - g_idx: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - dequant_weight = _dequantize_gptq( - qweight=qweight.to(device=x.device), - qzeros=qzeros.to(device=x.device), - scales=scales.to(device=x.device), - out_features=out_features, - in_features=in_features, - group_size=group_size, - g_idx=g_idx.to(device=x.device) if g_idx is not None else None, + if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) + else: + g_idx_t = g_idx.to(device=x.device, dtype=torch.int) + + out_shape = x.shape[:-1] + (qweight.shape[-1],) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) + + output = ops.gptq_gemm( + reshaped_x, + qweight, + qzeros, + scales, + g_idx_t, + True, # use_exllama (vLLM shuffles weights into exllama-friendly layout) + use_v2_format, + weight_bits, ) - return F.linear(x, dequant_weight, bias) + if bias is not None: + output.add_(bias.to(dtype=output.dtype)) + output = output.reshape(out_shape) + # Keep output dtype consistent with input activations for downstream layers. + return output.to(dtype=x.dtype) if output.dtype != x.dtype else output - def clear_cache(self) -> None: - """Clear cache (no-op, kept for compatibility).""" - pass diff --git a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py index 9141437..e1b085e 100644 --- a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py @@ -1,41 +1,25 @@ """ -W4A16 Linear quantization strategy (int4 weight + bf16 activation). +W4A16 Linear quantization strategy (int4 weight + bf16 activation), TileLang-free. -Reference implementation using Python dequantization + torch.nn.functional.linear. -Int4 weights are packed into int8 (2 int4 values per int8 byte). 
+vLLM-aligned behavior: +- vLLM has no general "online int4 -> fast GEMM" path on sm89 (e.g. RTX 4090); + real int4 speedups usually rely on GPTQ/AWQ Marlin/CUTLASS kernels and the corresponding offline weight formats. +- To avoid "looks like int4 but is actually running a bf16 GEMM", the slow `F.linear` path is disabled by default rather than taken silently. -Future optimizations: -- Replace F.linear with custom Triton/TileLang kernel for int4 GEMM +To temporarily allow the correctness-first slow fallback, set the environment variable: + `DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1` """ from __future__ import annotations from typing import Any, Optional -import os import torch import torch.nn.functional as F from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -try: - from diffulex_kernel.python.linear_kernels import w4a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - _TILELANG_AVAILABLE = False - w4a16_gemm = None - -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f - @register_linear_strategy(weight_dtype="int4", act_dtype="bf16") def _build_linear_int4_w4a16() -> LinearQuantizationStrategy: @@ -43,29 +27,10 @@ def _build_linear_int4_w4a16() -> LinearQuantizationStrategy: class LinearInt4W4A16Strategy(LinearQuantizationStrategy): - """W4A16 Linear strategy: int4 weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: per-output-channel symmetric quantization to int4. - Activation: kept as bf16 (no activation quantization). - - Int4 packing: Each int8 byte stores 2 int4 values (lower 4 bits and upper 4 bits). - Packed weight shape: [out_features, (in_features + 1) // 2] (int8) - - Lazy cache: Quantized weights are cached per weight tensor (by id) to avoid - re-quantizing on every forward pass. - """ - - def __init__(self): - """Initialize strategy with empty weight cache.""" + def __init__(self) -> None: super().__init__() - # Cache: weight_id -> (packed_weight_int8, scales) - # Using id(weight) as key since the same Parameter object is reused across forwards + # Cache: id(weight) -> (packed_int8 [N, ceil(K/2)], scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} @property def name(self) -> str: @@ -80,196 +45,60 @@ def linear_act_format(self) -> str: return "bf16" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # Weights are stored as int8 (1 byte per element), but each byte contains 2 int4 values - # So effective storage is 0.5 bytes per int4 weight element - return torch.int8, 1 # Physical storage is int8, but logical is int4 + return torch.int8, 1 + + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: + _ = kwargs + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") + return (original_shape[0],) @staticmethod def _pack_int4_to_int8(int4_tensor: torch.Tensor) -> torch.Tensor: - """Pack int4 tensor into int8 format.
- - Args: - int4_tensor: int8 tensor with values in range [-8, 7] (representing int4) - shape: [out_features, in_features] - - Returns: - Packed int8 tensor, shape: [out_features, (in_features + 1) // 2] - Each int8 byte contains 2 int4 values: lower 4 bits (first) and upper 4 bits (second) - """ - out_features, in_features = int4_tensor.shape - - # Clamp to int4 range [-8, 7] - int4_tensor = int4_tensor.clamp(-8, 7) - - # Convert to uint8 for easier bit manipulation - # Map [-8, 7] to [0, 15] by adding 8 - uint8_tensor = (int4_tensor + 8).to(torch.uint8) - - # Pad in_features to even number if needed - if in_features % 2 != 0: - # Pad with zeros (value 8 in uint8, which represents 0 in int4) - pad_size = 1 - padding = torch.zeros(out_features, pad_size, dtype=torch.uint8, device=uint8_tensor.device) + 8 - uint8_tensor = torch.cat([uint8_tensor, padding], dim=1) - padded_in_features = in_features + pad_size - else: - padded_in_features = in_features - - # Reshape to [out_features, in_features // 2, 2] - reshaped = uint8_tensor.view(out_features, padded_in_features // 2, 2) - - # Pack: first element in lower 4 bits, second element in upper 4 bits - # packed[i, j] = reshaped[i, j, 0] | (reshaped[i, j, 1] << 4) - packed = reshaped[:, :, 0] | (reshaped[:, :, 1] << 4) - - # Convert back to int8 - return packed.to(torch.int8) + # int4_tensor: int8 [N,K] values in [-8,7] + n, k = int4_tensor.shape + t = int4_tensor.clamp(-8, 7).to(torch.int16) + u = (t + 8).to(torch.uint8) # [0,15] + if k % 2 != 0: + u = torch.cat([u, torch.full((n, 1), 8, device=u.device, dtype=torch.uint8)], dim=1) + k = k + 1 + u2 = u.view(n, k // 2, 2) + packed = (u2[:, :, 0] | (u2[:, :, 1] << 4)).to(torch.int8) + return packed.contiguous() @staticmethod - def _unpack_int8_to_int4(packed_int8: torch.Tensor, original_in_features: int) -> torch.Tensor: - """Unpack int8 tensor back to int4 format. - - Args: - packed_int8: Packed int8 tensor, shape: [out_features, packed_size] - original_in_features: Original in_features dimension (before padding) - - Returns: - Unpacked int4 tensor (as int8 with values in range [-8, 7]), shape: [out_features, original_in_features] - """ - out_features, packed_size = packed_int8.shape - - # Convert to uint8 for bit manipulation - uint8_packed = packed_int8.to(torch.uint8) - - # Extract lower and upper 4 bits - lower = uint8_packed & 0x0F # Lower 4 bits - upper = (uint8_packed >> 4) & 0x0F # Upper 4 bits - - # Stack: [out_features, packed_size, 2] - unpacked_uint8 = torch.stack([lower, upper], dim=-1) - - # Reshape to [out_features, packed_size * 2] - unpacked_uint8 = unpacked_uint8.view(out_features, packed_size * 2) - - # Slice to original size (remove padding if any) - unpacked_uint8 = unpacked_uint8[:, :original_in_features] - - # Convert back to int4 range: [0, 15] -> [-8, 7] - unpacked_int4 = unpacked_uint8.to(torch.int8) - 8 - - return unpacked_int4 - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to int4 with per-channel (per-output) scales. 
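The nibble layout used by `_pack_int4_to_int8` / `_unpack_int8_to_int4` above (even k in the low nibble, odd k in the high nibble, values offset by +8) can be checked with a standalone round trip. The sketch below is illustrative only and independent of the strategy class:

import torch

def pack_nibbles(q: torch.Tensor) -> torch.Tensor:
    # q: int8 [N, K] with values in [-8, 7]
    n, k = q.shape
    u = (q.clamp(-8, 7).to(torch.int16) + 8).to(torch.uint8)                     # map to [0, 15]
    if k % 2:
        u = torch.cat([u, torch.full((n, 1), 8, dtype=torch.uint8)], dim=1)      # pad with the encoding of zero
    u = u.view(n, -1, 2)
    return (u[:, :, 0] | (u[:, :, 1] << 4)).to(torch.int8)                       # even k -> low nibble, odd k -> high nibble

def unpack_nibbles(packed: torch.Tensor, k: int) -> torch.Tensor:
    p = packed.view(torch.uint8)
    low = (p & 0x0F).to(torch.int16) - 8
    high = ((p >> 4) & 0x0F).to(torch.int16) - 8
    out = torch.stack([low, high], dim=-1).reshape(packed.shape[0], -1)
    return out[:, :k].to(torch.int8)

q = torch.randint(-8, 8, (4, 7), dtype=torch.int8)
assert torch.equal(unpack_nibbles(pack_nibbles(q), 7), q)

# Per-output-channel symmetric int4 scales (scale = absmax / 7) bound the
# quantization error by half a step per element, mirroring quantize() below:
w = torch.randn(4, 32)
scales = w.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
w_hat = torch.round(w / scales).clamp(-8, 7) * scales
assert torch.all((w - w_hat).abs() <= scales / 2 + 1e-6)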
- - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (packed_weight_int8, scales): - - packed_weight_int8: int8 tensor shape [out_features, (in_features + 1) // 2] - - scales: [out_features] - """ - _ = kwargs - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - # Avoid division by zero - scales = abs_max.clamp(min=1e-8) / 7.0 # [out_features, 1] (int4 range is -8 to 7, so max abs is 7) - - # Quantize: round(clamp(tensor / scales, -8, 7)) - quantized_int4 = torch.round(tensor / scales).clamp(-8, 7).to(torch.int8) - scales_1d = scales.squeeze(-1) # [out_features] - - # Pack int4 into int8 - packed_weight = self._pack_int4_to_int8(quantized_int4) - - return packed_weight, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize packed int4 tensor back to bf16 using per-channel scales. - - Args: - quantized: Packed int8 tensor [out_features, packed_size] - scale_or_metadata: scales tensor [out_features] or dict with 'scales' and 'original_in_features' - **kwargs: Additional arguments, may include 'original_in_features' - - Returns: - Dequantized tensor in bf16, shape [out_features, original_in_features] - """ + def _unpack_int8_to_int4(packed: torch.Tensor, *, original_k: int) -> torch.Tensor: + # packed: int8 [N, ceil(K/2)] (two nibbles per byte) + p = packed.view(torch.uint8) + low = (p & 0x0F).to(torch.int16) - 8 + high = ((p >> 4) & 0x0F).to(torch.int16) - 8 + n, pk = packed.shape + out = torch.empty((n, pk * 2), device=packed.device, dtype=torch.int16) + out[:, 0::2] = low + out[:, 1::2] = high + return out[:, :original_k].to(torch.int8).contiguous() + + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - original_in_features = scale_or_metadata.get("original_in_features") - else: - scales = scale_or_metadata - # Try to infer original_in_features from quantized shape - # packed_size = (in_features + 1) // 2, so in_features = packed_size * 2 or packed_size * 2 - 1 - packed_size = quantized.shape[1] - # We'll use the maximum possible (packed_size * 2), caller should provide original_in_features if needed - original_in_features = packed_size * 2 - + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + w = tensor.to(torch.bfloat16) + abs_max = w.abs().amax(dim=-1, keepdim=True) # [N,1] + scales = (abs_max.clamp(min=1e-8) / 7.0).to(torch.float32).squeeze(-1) # [N] + q = torch.round(w.to(torch.float32) / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8) + packed = self._pack_int4_to_int8(q) + return packed, {"scales": scales} + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + original_k = int(kwargs.get("original_in_features", 0)) + if original_k <= 0: + raise ValueError("original_in_features is required to dequantize int4 weights") + scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata if scales is None: raise ValueError("scales required for dequantization") - - # Get original_in_features from kwargs if provided - original_in_features = kwargs.get("original_in_features", 
original_in_features) - - # Unpack int4 from int8 - unpacked_int4 = self._unpack_int8_to_int4(quantized, original_in_features) - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = unpacked_int4.to(torch.float32) * scales - return dequantized.to(torch.bfloat16) - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """Quantize weight to int4 (packed as int8) with per-channel scales. - - Returns: - (packed_weight_int8, scales): - - packed_weight_int8: int8 [out, (in + 1) // 2] - - scales: [out] - """ - _ = kwargs - if device is not None: - weight = weight.to(device=device) - - packed_weight, scales = self.quantize(weight) - return packed_weight, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W4A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None + q = self._unpack_int8_to_int4(quantized, original_k=original_k).to(torch.float32) + w = q * scales.to(torch.float32).unsqueeze(-1) + return w.to(torch.bfloat16) def linear_forward( self, @@ -280,241 +109,31 @@ def linear_forward( quant_kind: str, **kwargs: Any, ) -> torch.Tensor: - """Compute Linear output using quantized weights (W4A16). - - Uses Python reference implementation (dequant + F.linear). - Future: Replace with TileLang kernel for int4 GEMM. - - Args: - x: Activation tensor [M, K] (bf16) - weight: Either bf16 weight [N, K] or packed int8 weight [N, (K + 1) // 2] - bias: Optional bias tensor [N] - quant_kind: Quantization kind (unused) - **kwargs: May include quant_scales and original_in_features for load-time quantized weights - """ _ = quant_kind - - # If caller provides a pre-quantized packed int8 weight + scales (e.g., load-time quantized module), - # use them directly and DO NOT populate the lazy cache (to avoid double-storage). - quant_scales = kwargs.pop("quant_scales", None) - original_in_features = kwargs.pop("original_in_features", None) - - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 (packed int4) but quant_scales is None; expected per-channel scales tensor") - # We have activation K; that's the real in_features for this matmul. - # Using packed_size*2 is fragile (it breaks if the int4 weights are stored "unpacked" as int8[N, K]). - M, K = x.shape - if original_in_features is None: - original_in_features = K - - # Accept both representations: - # - packed int4: int8[N, (K+1)//2] where each byte holds 2 int4 - # - unpacked int4: int8[N, K] where each element is an int4 value stored in int8 - expected_packed_K = (K + 1) // 2 - if weight.shape[1] == expected_packed_K: - packed_weight = weight - elif weight.shape[1] == K: - # Unpacked int4 -> pack on-the-fly so we can use the same kernel path. 
-                # Support both [-8, 7] (signed int4) and [0, 15] (uint4 stored in int8).
-                w = weight
-                if (w.min() >= 0) and (w.max() <= 15):
-                    w = (w.to(torch.int16) - 8).to(torch.int8)
-                packed_weight = self._pack_int4_to_int8(w)
-            else:
-                raise ValueError(
-                    f"Unexpected int4 weight shape for int8 weight: got {tuple(weight.shape)}, "
-                    f"expected (N,{expected_packed_K}) for packed or (N,{K}) for unpacked."
-                )
-            scales = quant_scales
-            if scales.dtype != torch.bfloat16:
-                scales = scales.to(dtype=torch.bfloat16)
-            if packed_weight.device != x.device:
-                packed_weight = packed_weight.to(device=x.device)
-            if scales.device != x.device:
-                scales = scales.to(device=x.device)
+        if not bool(int(__import__("os").environ.get("DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK", "0"))):
+            raise RuntimeError(
+                "Online `int4` quantization has no fast vLLM kernel on this platform/config (e.g. no CUTLASS W4A8 on 4090/sm89). "
+                "The `F.linear` slow fallback is disabled to avoid silently degrading to a bf16 GEMM. "
+                "Use `gptq/awq` (vLLM standard packed formats) instead, or set DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1 to enable it temporarily."
+            )
+        original_k = int(kwargs.get("original_in_features", x.shape[-1]))
+        quant_scales = kwargs.get("quant_scales", None)
+
+        if weight is not None and weight.dtype == torch.int8 and quant_scales is not None:
+            packed = weight.to(device=x.device)
+            scales = quant_scales.to(device=x.device, dtype=torch.float32)
         else:
-            # Lazy cache: use weight tensor id as key (only for bf16/fp16 weights)
-            weight_id = id(weight)
-
-            # Check cache
-            if weight_id in self._weight_cache:
-                packed_weight, scales = self._weight_cache[weight_id]
-                # Ensure cached tensors are on the correct device
-                if packed_weight.device != x.device:
-                    packed_weight = packed_weight.to(device=x.device)
-                    scales = scales.to(device=x.device)
-                # Get original_in_features from cached metadata or infer
-                if original_in_features is None:
-                    # Infer: packed_size = (in_features + 1) // 2
-                    packed_size = packed_weight.shape[1]
-                    original_in_features = packed_size * 2
+            wid = id(weight)
+            cached = self._weight_cache.get(wid)
+            if cached is None or cached[0].device != x.device:
+                packed, meta = self.quantize(weight)
+                packed = packed.to(device=x.device)
+                scales = meta["scales"].to(device=x.device, dtype=torch.float32)
+                self._weight_cache[wid] = (packed, scales)
            else:
-                # Quantize weight and cache it
-                packed_weight, scales = self.quantize_weight_for_kernel(weight, device=x.device)
-                # Cache the packed weight and scales
-                self._weight_cache[weight_id] = (packed_weight, scales)
-                # Store original_in_features for later use
-                original_in_features = weight.shape[1]
-
-        # Speed-first option:
-        # If enabled, dequantize once and reuse a cached bf16 weight for F.linear (cuBLAS).
-        # This trades extra GPU memory for throughput.
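A usage sketch of the guarded fallback above, under the assumption that the strategy class is importable from the module this hunk patches; the shapes and the `quant_kind` value are arbitrary:

import os
import torch
from diffulex.utils.quantization.strategies.linear_int4_w4a16 import LinearInt4W4A16Strategy

os.environ["DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK"] = "1"   # explicitly opt in to dequant + F.linear

strategy = LinearInt4W4A16Strategy()
w = torch.randn(128, 64, dtype=torch.bfloat16)           # bf16 [N, K] weight, quantized lazily and cached
x = torch.randn(4, 64, dtype=torch.bfloat16)             # bf16 activations [M, K]
y = strategy.linear_forward(x, w, None, quant_kind="other")
assert y.shape == (4, 128) and y.dtype == torch.bfloat16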
- if os.getenv("DIFFULEX_W4A16_PREFER_CUBLAS", "0") == "1": - deq_key = id(weight) - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - deq_w = self.dequantize( - packed_weight, - scales, - original_in_features=original_in_features, - ) - if deq_w.device != x.device: - deq_w = deq_w.to(device=x.device) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and w4a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x, packed_weight, scales, bias, original_in_features=original_in_features) - - # Check CUDA compute capability (skip kernel if unsupported) - try: - if torch.cuda.is_available(): - props = torch.cuda.get_device_properties(x.device.index or 0) - compute_cap = (props.major, props.minor) - # Let TileLang handle the check and fallback gracefully - pass - except Exception: - # If we can't check compute capability, still try the kernel - pass - - # Get shapes - M, K = x.shape - N, packed_K = packed_weight.shape - # Verify packed_K matches expected packed size for K - expected_packed_K = (original_in_features + 1) // 2 - assert packed_K == expected_packed_K, f"Packed K dimension mismatch: {packed_K} != {expected_packed_K}" - - # Reduce TileLang JIT compilation churn without killing small-M decode performance. - # Previous logic padded *any* M!=1 to 64/128/256, which can turn decode M=2/4 into M=64. - # We instead bucket to a small stable set: - # - for M<=64: next power-of-two (2,4,8,16,32,64) - # - for M>64: round up to a multiple of 64 - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad + packed, scales = cached - # TileLang autotune: use warmup + config cache pattern - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - with set_autotune_inputs([x_for_kernel, packed_weight, scales]): - kernel = w4a16_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - if config is not None: - kernel = w4a16_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - kernel = w4a16_gemm(M_bucket, N, K, block_M=64, block_N=64, block_K=128, num_stages=2, threads=128) - - # Call kernel - out_idx=[3] means output is the 4th parameter, - # so we only pass inputs (x, packed_weight, scales), and kernel returns output - output_full = kernel(x_for_kernel, packed_weight, scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if bias is not None: - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - import warnings - error_msg = str(e) - - # Extract meaningful error information - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - silently fallback - pass - elif 'Compilation error' in error_msg: - # Extract the actual error - idx = 
error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - warnings.warn( - f"TileLang W4A16 kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - silently fallback - pass - else: - # Warn for unexpected errors - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"TileLang W4A16 kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - return self._fallback_python_forward(x, packed_weight, scales, bias, original_in_features=original_in_features) - else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x, packed_weight, scales, bias, original_in_features=original_in_features) - - def _fallback_python_forward( - self, - x: torch.Tensor, - packed_weight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - *, - original_in_features: int, - ) -> torch.Tensor: - """Fallback Python implementation: unpack + dequantize + F.linear.""" - # Unpack and dequantize - dequantized_weight = self.dequantize( - packed_weight, - scales, - original_in_features=original_in_features - ) - - # Compute linear output - return F.linear(x, dequantized_weight, bias) - - def clear_cache(self) -> None: - """Clear the weight quantization cache. - - Useful for memory management or when weights are updated (e.g., fine-tuning). - """ - self._weight_cache.clear() - self._dequant_weight_cache.clear() + # Slow fallback (explicitly opted-in). + w_deq = self.dequantize(packed, {"scales": scales}, original_in_features=original_k) + return F.linear(x, w_deq, bias) diff --git a/diffulex/utils/quantization/strategies/linear_int4_w4a8.py b/diffulex/utils/quantization/strategies/linear_int4_w4a8.py index f2287e0..decb19d 100644 --- a/diffulex/utils/quantization/strategies/linear_int4_w4a8.py +++ b/diffulex/utils/quantization/strategies/linear_int4_w4a8.py @@ -1,145 +1,25 @@ """ -W4A8 Linear quantization strategy (int4 weight + int8 activation). +W4A8 Linear quantization strategy (int4 weight + int8 activation), TileLang-free. -Notes: -- Weight is per-output-channel symmetric int4 packed into int8 (2 values per byte), with per-channel scales. -- Activation is quantized per-row to int8 with per-row scales. -- GEMM is performed by unpacking int4 -> int8 and using `torch._int_mm` (int8 x int8 -> int32). - For now we cache the unpacked (and transposed) weight to avoid repeated unpack. -- If int8 GEMM is not available, we fall back to unpack+dequant BF16 + cuBLAS (F.linear). 
+vLLM-aligned behavior:
+- vLLM's CUTLASS W4A8 kernel requires sm90 (Hopper); it is unavailable on sm89 (e.g. RTX 4090).
+- To avoid silently degrading to a bf16 GEMM, the `F.linear` slow fallback is disabled by default.
+
+To temporarily allow the correctness-first slow fallback, set:
+  `DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1`
 """
 
 from __future__ import annotations
 
 from typing import Any, Optional
 
-import os
-import warnings
-
 import torch
 import torch.nn.functional as F
 
-from diffulex.attention.metadata import is_warming_up
 from diffulex.utils.quantization.registry import register_linear_strategy
 from diffulex.utils.quantization.strategy import LinearQuantizationStrategy
 
-try:
-    from diffulex_kernel.python.linear_kernels import (
-        w4a8_gemm,
-        w4a8_scaled_gemm,
-        w4a8_fused_act_gemm,
-        w8a8_act_quant,
-    )
-    _TILELANG_AVAILABLE = True
-except ImportError:
-    _TILELANG_AVAILABLE = False
-    w4a8_gemm = None
-    w4a8_scaled_gemm = None
-    w8a8_act_quant = None
-    w4a8_fused_act_gemm = None
-
-try:
-    # Optional: only needed for TileLang autotune warmup.
-    from tilelang.autotuner import set_autotune_inputs  # type: ignore
-except Exception:
-    set_autotune_inputs = None
-
-
-_DEFAULT_TL_LINEAR_CFG: dict[str, Any] = {
-    "block_M": 64,
-    "block_N": 64,
-    "block_K": 128,
-    "num_stages": 2,
-    "threads": 128,
-}
-
-
-def _quantize_per_row_int8_torch(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    abs_max = x.abs().amax(dim=-1, keepdim=False)  # [M]
-    scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float32)  # [M]
-    x_q = torch.round(x.to(torch.float32) / scales.unsqueeze(-1)).clamp(-127, 127).to(torch.int8)
-    return x_q, scales
-
-
-def _quantize_per_row_int8(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    """Per-row symmetric int8 quantization with optional TileLang fused kernel.
-
-    Default: use TileLang fused kernel if available, otherwise fall back to torch ops.
-
-    Env:
-      - DIFFULEX_W4A8_USE_TL_ACT_QUANT=0 to force torch fallback.
-    """
-    use_tl = os.getenv("DIFFULEX_W4A8_USE_TL_ACT_QUANT", "1") == "1"
-    if (
-        use_tl
-        and _TILELANG_AVAILABLE
-        and (w8a8_act_quant is not None)
-        and x.is_cuda
-        and x.dtype == torch.bfloat16
-        and x.is_contiguous()
-        and x.dim() == 2
-    ):
-        m, k = x.shape
-        if m <= 16:
-            block_m = 16
-        elif m <= 32:
-            block_m = 32
-        else:
-            block_m = 64
-        try:
-            kernel = w8a8_act_quant(
-                m,
-                k,
-                block_M=block_m,
-                block_K=256,
-                threads=128,
-            )
-            x_q, scales = kernel(x)
-            return x_q, scales
-        except Exception:
-            pass
-    return _quantize_per_row_int8_torch(x)
-
-
-def _int8_mm(a_int8: torch.Tensor, b_int8: torch.Tensor) -> torch.Tensor:
-    if hasattr(torch, "_int_mm"):
-        return torch._int_mm(a_int8, b_int8)
-    if hasattr(torch.ops.aten, "_int_mm"):
-        return torch.ops.aten._int_mm(a_int8, b_int8)
-    raise RuntimeError("No int8 GEMM backend found (torch._int_mm / aten._int_mm missing)")
-
-
-def _unpack_int4_packed_int8(packed: torch.Tensor, *, original_in_features: int) -> torch.Tensor:
-    """Unpack int4 weights stored in int8 bytes (2 nibbles per byte) into int8 values in [-8, 7].
-
-    Args:
-        packed: int8 [N, ceil(K/2)]
-        original_in_features: K
-    Returns:
-        unpacked: int8 [N, K]
-    """
-    if packed.dtype != torch.int8:
-        raise TypeError(f"packed weight must be int8, got {packed.dtype}")
-    N, packed_K = packed.shape
-    expected = (original_in_features + 1) // 2
-    if packed_K != expected:
-        raise ValueError(f"Packed K mismatch: got {packed_K}, expected {expected} for K={original_in_features}")
-
-    # Interpret bytes as uint8 so we can shift/mask predictably.
- p_u8 = packed.view(torch.uint8) - low = (p_u8 & 0x0F).to(torch.int16) - high = ((p_u8 >> 4) & 0x0F).to(torch.int16) - - # Convert unsigned nibble [0..15] to signed int4 [-8..7] - low_s = torch.where(low >= 8, low - 16, low) - high_s = torch.where(high >= 8, high - 16, high) - - # Interleave low/high along K - out = torch.empty((N, packed_K * 2), device=packed.device, dtype=torch.int16) - out[:, 0::2] = low_s - out[:, 1::2] = high_s - out = out[:, :original_in_features].to(torch.int8) - return out +from .linear_int4_w4a16 import LinearInt4W4A16Strategy @register_linear_strategy(weight_dtype="int4", act_dtype="int8") @@ -148,17 +28,9 @@ def _build_linear_int4_w4a8() -> LinearQuantizationStrategy: class LinearInt4W4A8Strategy(LinearQuantizationStrategy): - def __init__(self): + def __init__(self) -> None: super().__init__() - # bf16 weight id -> (packed_int8[N,ceil(K/2)], scales_bf16[N]) - self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # (packed_id, K) -> unpacked_int8[N,K] - self._unpacked_cache: dict[tuple[int, int], torch.Tensor] = {} - # (packed_id, K) -> unpacked_t_int8[K,N] - self._unpacked_t_cache: dict[tuple[int, int], torch.Tensor] = {} - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # (device_index, M_bucket, N, K) -> TileLang config dict for fused kernel - self._tl_fused_cfg_cache: dict[tuple[int, int, int, int], dict[str, Any]] = {} + self._w4a16 = LinearInt4W4A16Strategy() @property def name(self) -> str: @@ -173,71 +45,16 @@ def linear_act_format(self) -> str: return "int8" def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # stored as packed int8 bytes (2 weights per byte) return torch.int8, 1 - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: + return self._w4a16.get_scale_shape(original_shape, **kwargs) - def clear_cache(self) -> None: - self._weight_cache.clear() - self._unpacked_cache.clear() - self._unpacked_t_cache.clear() - self._dequant_weight_cache.clear() - self._tl_fused_cfg_cache.clear() + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: + return self._w4a16.quantize(tensor, **kwargs) - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - _ = kwargs - # Per-output-channel symmetric int4 quantization: scale = absmax/7 - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [N,1] - # Keep scales in fp16 to reduce scale quantization error (A8 paths are sensitive). - scales = (abs_max.clamp(min=1e-8) / 7.0).to(torch.float16) # [N,1] - q = torch.round(tensor / scales).clamp(-8, 7).to(torch.int16) # [N,K] - - # Pack two int4 into one byte: low nibble for even k, high nibble for odd k. 
- N, K = q.shape - packed_K = (K + 1) // 2 - q_even = q[:, 0::2] - q_odd = q[:, 1::2] - if q_odd.shape[1] != q_even.shape[1]: - q_odd = torch.nn.functional.pad(q_odd, (0, 1), value=0) - - q_even_u = (q_even & 0x0F).to(torch.uint8) - q_odd_u = (q_odd & 0x0F).to(torch.uint8) - packed_u8 = q_even_u | (q_odd_u << 4) # [N, packed_K] - packed_i8 = packed_u8.view(torch.int8) - return packed_i8, scales.squeeze(-1) - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - original_in_features = kwargs.get("original_in_features", None) - if original_in_features is None: - raise ValueError("original_in_features is required for int4 dequantize") - scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata - if scales is None: - raise ValueError("scales required for dequantization") - w_i8 = _unpack_int4_packed_int8(quantized, original_in_features=original_in_features) # [N,K] - deq = w_i8.to(torch.float32) * scales.to(torch.float32).unsqueeze(-1) - return deq.to(torch.bfloat16) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - _ = kwargs - if device is not None: - weight = weight.to(device=device) - return self.quantize(weight) + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + return self._w4a16.dequantize(quantized, scale_or_metadata, **kwargs) def linear_forward( self, @@ -249,259 +66,12 @@ def linear_forward( **kwargs: Any, ) -> torch.Tensor: _ = quant_kind - quant_scales = kwargs.pop("quant_scales", None) - original_in_features = kwargs.pop("original_in_features", None) - if original_in_features is None: - raise ValueError("W4A8 requires original_in_features for packed int4 weights") - - # Resolve / cache packed weight + scales - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 (packed int4) but quant_scales is None") - packed = weight if weight.device == x.device else weight.to(device=x.device) - w_scales = quant_scales - # Prefer fp16 scales for quality (and fused kernel expects fp16 scales). - if w_scales.dtype != torch.float16: - w_scales = w_scales.to(dtype=torch.float16) - if w_scales.device != x.device: - w_scales = w_scales.to(device=x.device) - weight_id = id(weight) - else: - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None: - packed, w_scales = self.quantize_weight_for_kernel(weight, device=x.device) - self._weight_cache[weight_id] = (packed, w_scales) - else: - packed, w_scales = cached - if packed.device != x.device: - packed = packed.to(device=x.device) - w_scales = w_scales.to(device=x.device) - self._weight_cache[weight_id] = (packed, w_scales) - - # Optional: dequant once and use cuBLAS BF16 - if os.getenv("DIFFULEX_W4A8_PREFER_CUBLAS", "0") == "1": - deq_key = weight_id - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - deq_w = self.dequantize(packed, w_scales, original_in_features=original_in_features) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Quantize activation per-row to int8 - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - if x.dtype != torch.bfloat16: - x = x.to(torch.bfloat16) - - # Try TileLang fused quant + GEMM first (bf16 activation input). 
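Because the W4A8 strategy above composes `LinearInt4W4A16Strategy` rather than re-implementing the packing, both strategies produce byte-identical packed weights and scales. A small check, assuming both modules are importable as laid out in this patch:

import torch
from diffulex.utils.quantization.strategies.linear_int4_w4a16 import LinearInt4W4A16Strategy
from diffulex.utils.quantization.strategies.linear_int4_w4a8 import LinearInt4W4A8Strategy

w = torch.randn(16, 32, dtype=torch.bfloat16)
packed_a, meta_a = LinearInt4W4A16Strategy().quantize(w)
packed_b, meta_b = LinearInt4W4A8Strategy().quantize(w)      # delegates to the W4A16 implementation
assert torch.equal(packed_a, packed_b)
assert torch.equal(meta_a["scales"], meta_b["scales"])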
- use_fused = os.getenv("DIFFULEX_W4A8_USE_TL_FUSED_GEMM", "1") == "1" - if ( - use_fused - and _TILELANG_AVAILABLE - and (w4a8_fused_act_gemm is not None) - and x.is_cuda - and x.dtype == torch.bfloat16 - and x.dim() == 2 - and x.is_contiguous() - ): - try: - M, K = x.shape - N, packed_K = packed.shape - expected_packed_K = (original_in_features + 1) // 2 - assert packed_K == expected_packed_K, ( - f"Packed K mismatch: got {packed_K}, expected {expected_packed_K} for K={original_in_features}" - ) - - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.bfloat16) - x_pad[:M, :] = x - x_for_kernel = x_pad - - dev_idx = x.device.index or 0 - cfg_key = (dev_idx, M_bucket, N, original_in_features) - cfg = self._tl_fused_cfg_cache.get(cfg_key) - kernel = None - - # TileLang autotune (warmup-only): we set real inputs so the autotuner can benchmark configs. - if cfg is None and is_warming_up() and set_autotune_inputs is not None: - try: - with set_autotune_inputs([x_for_kernel, packed, w_scales]): - kernel = w4a8_fused_act_gemm(M_bucket, N, original_in_features) - cfg = kernel.config - self._tl_fused_cfg_cache[cfg_key] = cfg - except Exception: - # Cache a safe default to avoid retriggering autotune for this key. - cfg = _DEFAULT_TL_LINEAR_CFG - self._tl_fused_cfg_cache[cfg_key] = cfg - - if cfg is None: - cfg = _DEFAULT_TL_LINEAR_CFG - self._tl_fused_cfg_cache[cfg_key] = cfg - - if kernel is None: - kernel = w4a8_fused_act_gemm(M_bucket, N, original_in_features, **cfg) - out_full = kernel(x_for_kernel, packed, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - if bias is not None: - out = out + bias - return out - except Exception as e: - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"W4A8 fused quant GEMM failed, falling back to quantize+GEMM: {error_msg}", - UserWarning, - ) - - # Step-local cache for activation quantization (reuse within one step for QKV/gate-up, etc.) 
- use_cache = os.getenv("DIFFULEX_W4A8_ACT_QUANT_CACHE", "1") == "1" - cached = None - if use_cache: - try: - from diffulex.utils.quantization.context import get_cached_act_quant, set_cached_act_quant - cached = get_cached_act_quant(x) - except Exception: - cached = None - if cached is not None: - x_q, x_scales = cached - else: - x_q, x_scales = _quantize_per_row_int8(x) - if use_cache: - try: - set_cached_act_quant(x, x_q, x_scales) - except Exception: - pass - if x_q.device != x.device: - x_q = x_q.to(device=x.device) - x_scales = x_scales.to(device=x.device) - - # Get shapes - M, K = x_q.shape - N, packed_K = packed.shape - expected_packed_K = (original_in_features + 1) // 2 - assert packed_K == expected_packed_K, f"Packed K mismatch: got {packed_K}, expected {expected_packed_K} for K={original_in_features}" - - # Try TileLang kernel first if available (uses packed weights directly) - if _TILELANG_AVAILABLE and (w4a8_scaled_gemm is not None or w4a8_gemm is not None): - try: - # Check device - if x.device.type != 'cuda': - # Fall through to _int8_mm fallback - pass - else: - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_q_for_kernel = x_q - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8) - x_pad[:M, :] = x_q - x_q_for_kernel = x_pad - x_scales_pad = torch.zeros((M_bucket,), device=x.device, dtype=torch.float32) - x_scales_pad[:M] = x_scales.to(torch.float32) - x_scales_for_kernel = x_scales_pad - else: - x_scales_for_kernel = x_scales.to(torch.float32) - - # Prefer fused-scale kernel: outputs bf16 directly. - if w4a8_scaled_gemm is not None: - kernel = w4a8_scaled_gemm( - M_bucket, - N, - original_in_features, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_full = kernel(x_q_for_kernel, packed, x_scales_for_kernel, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - else: - # Fallback to int32-output kernel + python scaling - kernel = w4a8_gemm( - M_bucket, - N, - original_in_features, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_i32_full = kernel(x_q_for_kernel, packed) - out_i32 = out_i32_full[:M, :] if M_bucket != M else out_i32_full - - out_fp32 = out_i32.to(torch.float32) - out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1) - out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0) - out = out_fp32.to(torch.bfloat16) - - if bias is not None: - out = out + bias - return out - except Exception as e: - # Fallback to _int8_mm on any kernel error - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." 
-            warnings.warn(f"W4A8 TileLang kernel failed, falling back to torch._int_mm: {error_msg}", UserWarning)
-
-        # Fallback: unpack weight and use torch._int_mm
-        # Unpack weight to int8 and cache
-        packed_key = (id(packed), int(original_in_features))
-        w_i8 = self._unpacked_cache.get(packed_key)
-        if w_i8 is None or w_i8.device != x.device:
-            w_i8 = _unpack_int4_packed_int8(packed, original_in_features=original_in_features)
-            self._unpacked_cache[packed_key] = w_i8
-
-        wt = self._unpacked_t_cache.get(packed_key)
-        if wt is None or wt.device != x.device:
-            wt = w_i8.t().contiguous()
-            self._unpacked_t_cache[packed_key] = wt
-
-        # Pad small M for backend constraints (M > 16)
-        if M <= 16:
-            M_bucket = 17
-            x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8)
-            x_pad[:M, :] = x_q
-            x_q_for_mm = x_pad
-        else:
-            x_q_for_mm = x_q
-
-        try:
-            out_i32_full = _int8_mm(x_q_for_mm, wt)
-        except Exception as e:
-            msg = str(e)
-            if len(msg) > 200:
-                msg = msg[:200] + "..."
-            warnings.warn(f"W4A8 int8 GEMM failed, falling back to BF16 F.linear: {msg}", UserWarning)
-            deq_w = self.dequantize(packed, w_scales, original_in_features=original_in_features)
-            return F.linear(x, deq_w, bias)
-
-        out_i32 = out_i32_full[:M, :] if M <= 16 else out_i32_full
-        out_fp32 = out_i32.to(torch.float32)
-        out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1)
-        out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0)
-        out = out_fp32.to(torch.bfloat16)
-        if bias is not None:
-            out = out + bias
-        return out
-
+        if not bool(int(__import__("os").environ.get("DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK", "0"))):
+            raise RuntimeError(
+                "Online `int4` quantization has no fast vLLM kernel on this platform/config (e.g. no CUTLASS W4A8 on 4090/sm89). "
+                "The `F.linear` slow fallback is disabled to avoid silently degrading to a bf16 GEMM. "
+                "Use `gptq/awq` (vLLM standard packed formats) instead, or set DIFFULEX_ALLOW_SLOW_QUANT_FALLBACK=1 to enable it temporarily."
+            )
+        # Correctness-first: reuse W4A16 implementation.
+        return self._w4a16.linear_forward(x, weight, bias, quant_kind="other", **kwargs)
diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_int8_w8a16.py
index d3e4db9..67ab104 100644
--- a/diffulex/utils/quantization/strategies/linear_int8_w8a16.py
+++ b/diffulex/utils/quantization/strategies/linear_int8_w8a16.py
@@ -1,546 +1,29 @@
 """
 W8A16 Linear quantization strategy (int8 weight + bf16 activation).
 
-Reference implementation using Python dequantization + torch.nn.functional.linear.
-Future optimizations:
-- Lazy cache quantized weights per module instance
-- Replace F.linear with custom Triton/TileLang kernel for int8 GEMM
+This path is now implemented by reusing Diffulex's marlin(AllSpark)-style W8A16
+strategy, which matches vLLM's effective fast path and avoids TileLang.
""" from __future__ import annotations -from typing import Any, Optional - -import os -import torch -import torch.nn.functional as F - from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Try to import TileLang kernel, fallback to None if not available -try: - from diffulex_kernel.python.linear_kernels import w8a16_gemm - _TILELANG_AVAILABLE = True -except ImportError: - _TILELANG_AVAILABLE = False - w8a16_gemm = None +from .linear_marlin_int8_w8a16 import LinearMarlinInt8W8A16Strategy -try: - from diffulex_kernel.python.linear_kernels import w8a16_gemm_bias -except ImportError: - w8a16_gemm_bias = None -try: - from diffulex.attention.metadata import is_warming_up - from tilelang.autotuner import set_autotune_inputs - _AUTOTUNE_AVAILABLE = True -except ImportError: - _AUTOTUNE_AVAILABLE = False - is_warming_up = lambda: False - set_autotune_inputs = lambda *args, **kwargs: lambda f: f +class LinearInt8W8A16Strategy(LinearMarlinInt8W8A16Strategy): + """ + Compatibility alias for the historical Diffulex strategy name. + + This keeps the registry and `strategies.__init__` imports stable while + reusing the vLLM-aligned marlin(AllSpark) W8A16 implementation. + """ @register_linear_strategy(weight_dtype="int8", act_dtype="bf16") def _build_linear_int8_w8a16() -> LinearQuantizationStrategy: + # Alias to marlin(AllSpark) W8A16 implementation. return LinearInt8W8A16Strategy() - -class LinearInt8W8A16Strategy(LinearQuantizationStrategy): - """W8A16 Linear strategy: int8 weight quantization + bf16 activation. - - Current implementation: Python reference using dequantized weights + F.linear. - Weight quantization: per-output-channel symmetric quantization to int8. - Activation: kept as bf16 (no activation quantization). - - Lazy cache: Quantized weights are cached per weight tensor (by id) to avoid - re-quantizing on every forward pass. 
- """ - - def __init__(self): - """Initialize strategy with empty weight cache.""" - super().__init__() - # Cache: weight_id -> (quantized_weight, scales) - # Using id(weight) as key since the same Parameter object is reused across forwards - self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # Optional cache: weight_id -> bf16 dequantized weight (speed-first; uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # bias cache for fused-bias kernel (store fp16 copy on device) - self._bias_f16_cache: dict[int, torch.Tensor] = {} - # TileLang autotune config cache: (device, M_bucket, N, K) -> config dict - self._tl_autotune_config_cache: dict[tuple[str, int, int, int], dict] = {} - # Lightweight runtime observability (opt-in by env var) - self._rt_call_count: int = 0 - self._rt_fallback_count: int = 0 - self._rt_m_hist_le64: dict[int, int] = {} - - def _rt_enabled(self) -> bool: - return os.getenv("DIFFULEX_LINEAR_PROFILE", "0") == "1" - - def _rt_log_every(self) -> int: - try: - return int(os.getenv("DIFFULEX_LINEAR_PROFILE_EVERY", "200")) - except Exception: - return 200 - - def _rt_on_call(self, *, m: int, n: int, k: int) -> None: - if not self._rt_enabled(): - return - self._rt_call_count += 1 - if m <= 64: - self._rt_m_hist_le64[m] = self._rt_m_hist_le64.get(m, 0) + 1 - every = self._rt_log_every() - if every > 0 and (self._rt_call_count % every == 0): - top = sorted(self._rt_m_hist_le64.items(), key=lambda kv: (-kv[1], kv[0]))[:8] - top_str = ", ".join([f"M={mm}:{cc}" for mm, cc in top]) if top else "empty" - print( - f"[DIFFULEX_LINEAR_PROFILE][w8a16] calls={self._rt_call_count} " - f"fallbacks={self._rt_fallback_count} last(M,N,K)=({m},{n},{k}) " - f"M_hist_le64_top={top_str}", - flush=True, - ) - - def _rt_on_fallback(self, *, m: int, n: int, k: int, reason: str) -> None: - if not self._rt_enabled(): - return - self._rt_fallback_count += 1 - # Avoid spam: only print first few fallbacks, then rely on periodic summary. - max_print = 5 - try: - max_print = int(os.getenv("DIFFULEX_LINEAR_FALLBACK_MAX_PRINT", "5")) - except Exception: - pass - if self._rt_fallback_count <= max_print: - print( - f"[DIFFULEX_LINEAR_PROFILE][w8a16][FALLBACK] " - f"count={self._rt_fallback_count} (M,N,K)=({m},{n},{k}) reason={reason}", - flush=True, - ) - - @property - def name(self) -> str: - return "linear_int8_w8a16" - - @property - def linear_weight_format(self) -> str: - return "int8" - - @property - def linear_act_format(self) -> str: - return "bf16" - - def get_storage_dtype(self) -> tuple[torch.dtype, int]: - # Weights are stored as int8 (1 byte per element) - return torch.int8, 1 - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: - """Quantize tensor to int8 with per-channel (per-output) scales. 
- - Args: - tensor: Weight tensor of shape [out_features, in_features] - **kwargs: Additional arguments (unused for now) - - Returns: - (quantized_tensor, scales): quantized_tensor is int8, scales is [out_features] - """ - _ = kwargs - # Per-output-channel quantization: compute scale for each output channel - # shape: [out_features, in_features] -> scales shape: [out_features] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [out_features, 1] - # Avoid division by zero - scales = abs_max.clamp(min=1e-8) / 127.0 # [out_features, 1] - - # Quantize: round(clamp(tensor / scales, -128, 127)) - quantized = torch.round(tensor / scales).clamp(-128, 127).to(torch.int8) - scales_1d = scales.squeeze(-1) # [out_features] - - return quantized, scales_1d - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - """Dequantize int8 tensor back to bf16 using per-channel scales. - - Args: - quantized: int8 tensor [out_features, in_features] - scale_or_metadata: scales tensor [out_features] or dict with 'scales' - **kwargs: Additional arguments (unused for now) - - Returns: - Dequantized tensor in bf16 - """ - _ = kwargs - if isinstance(scale_or_metadata, dict): - scales = scale_or_metadata.get("scales") - else: - scales = scale_or_metadata - - if scales is None: - raise ValueError("scales required for dequantization") - - # Ensure scales have correct shape for broadcasting - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [out_features, 1] - - # Dequantize: quantized * scales - dequantized = quantized.to(torch.float32) * scales - return dequantized.to(torch.bfloat16) - - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. - """ - _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] - return (original_shape[0],) - - def quantize_weight_for_kernel( - self, - weight: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """Quantize weight to int8 with per-channel scales. - - Returns: - (quantized_weight, scales): quantized_weight is int8 [out, in], scales is [out] - """ - _ = kwargs - if device is not None: - weight = weight.to(device=device) - - quantized, scales = self.quantize(weight) - return quantized, scales - - def quantize_act_for_kernel( - self, - x: torch.Tensor, - *, - device: torch.device | None = None, - **kwargs: Any, - ) -> tuple[torch.Tensor, Any]: - """No activation quantization for W8A16 (activation stays bf16).""" - if device is not None: - x = x.to(device=device) - return x, None - - def linear_forward( - self, - x: torch.Tensor, - weight: torch.Tensor, - bias: Optional[torch.Tensor], - *, - quant_kind: str, - **kwargs: Any, - ) -> torch.Tensor: - """Compute Linear output using quantized weights (W8A16). - - Uses TileLang kernel if available and conditions are met, otherwise falls back - to Python reference implementation (dequant + F.linear). 
- - Conditions for using TileLang kernel: - - TileLang is available - - Device is CUDA - - (Kernel supports tail sizes; no K%128 constraint required) - """ - _ = quant_kind - - # If caller provides a pre-quantized int8 weight + scales (e.g., load-time quantized module), - # use them directly and DO NOT populate the lazy cache (to avoid double-storage). - quant_scales = kwargs.pop("quant_scales", None) - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 but quant_scales is None; expected per-channel scales tensor") - quantized_weight = weight - scales = quant_scales - if scales.dtype != torch.bfloat16: - scales = scales.to(dtype=torch.bfloat16) - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - if scales.device != x.device: - scales = scales.to(device=x.device) - else: - # Lazy cache: use weight tensor id as key (only for bf16/fp16 weights) - weight_id = id(weight) - - # Check cache - if weight_id in self._weight_cache: - quantized_weight, scales = self._weight_cache[weight_id] - # Ensure cached tensors are on the correct device - if quantized_weight.device != x.device: - quantized_weight = quantized_weight.to(device=x.device) - scales = scales.to(device=x.device) - else: - # Quantize weight and cache it - quantized_weight, scales = self.quantize_weight_for_kernel(weight, device=x.device) - # Cache the quantized weight and scales - self._weight_cache[weight_id] = (quantized_weight, scales) - - # Speed-first option: - # Using the TileLang kernel can be slower than cuBLAS BF16 GEMM for small/typical decode shapes. - # If enabled, we dequantize once and reuse a cached bf16 weight for F.linear (cuBLAS). - # This trades extra GPU memory for throughput. - if os.getenv("DIFFULEX_W8A16_PREFER_CUBLAS", "0") == "1": - # Key by the actual weight object we received (bf16 Parameter or int8 buffer). - deq_key = id(weight) - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - # Dequantize: int8[N,K] * scales[N] -> bf16[N,K] - s = scales - if s.dim() == 1: - s = s.unsqueeze(-1) - deq_w = (quantized_weight.to(torch.float32) * s.to(torch.float32)).to(torch.bfloat16) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Try to use TileLang kernel if available - if _TILELANG_AVAILABLE and w8a16_gemm is not None: - try: - # Check device - if x.device.type != 'cuda': - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - # Check CUDA compute capability (skip kernel if unsupported) - # sm_89 (Hopper) requires CUDA 11.8+, sm_90+ requires CUDA 12.0+ - # If CUDA toolkit doesn't support the GPU architecture, skip kernel attempt - try: - if torch.cuda.is_available(): - props = torch.cuda.get_device_properties(x.device.index or 0) - compute_cap = (props.major, props.minor) - # sm_89 requires CUDA 11.8+, sm_90+ requires CUDA 12.0+ - # For now, we'll let TileLang handle the check and fallback gracefully - # This is a conservative approach - we try the kernel and let it fail gracefully - pass - except Exception: - # If we can't check compute capability, still try the kernel - pass - - # Get shapes - M, K = x.shape - N, K_w = quantized_weight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - self._rt_on_call(m=M, n=N, k=K) - - # Reduce TileLang JIT compilation churn without killing small-M decode performance. - # Previous logic padded *any* M!=1 to 64/128/256, which can turn decode M=2/4 into M=64. 
- # We instead bucket to a small stable set: - # - for M<=64: next power-of-two (2,4,8,16,32,64) - # - for M>64: round up to a multiple of 64 - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - else: - M_bucket = 1 - - # TileLang MMA GEMM requires M divisible by 16. - # For decode small-M (1/2/4/8), pad minimally to 16 (much cheaper than padding to 64). - if M_bucket < 16: - M_bucket = 16 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=x.dtype) - x_pad[:M, :] = x - x_for_kernel = x_pad - - # Choose a small-M friendly block_M to reduce wasted work in decode. - # Keep variants bounded to avoid compilation churn and satisfy MMA constraints: - # use only {16, 32, 64} so M is always divisible by 16. - if M_bucket <= 16: - block_m = 16 - elif M_bucket <= 32: - block_m = 32 - else: - block_m = 64 - - # TileLang autotune: use warmup + config cache pattern - # NOTE: fused-bias kernel currently regresses decode throughput significantly on typical workloads. - # Keep it disabled by default; can be enabled for experimentation. - fuse_bias = os.getenv("DIFFULEX_W8A16_FUSE_BIAS", "0") == "1" - use_bias_kernel = fuse_bias and (bias is not None) and (w8a16_gemm_bias is not None) - - cache_key = (str(x.device), M_bucket, N, K) - config = self._tl_autotune_config_cache.get(cache_key) - - if _AUTOTUNE_AVAILABLE and is_warming_up() and config is None: - # Warmup phase: run autotune with real inputs - try: - if use_bias_kernel: - b_key = id(bias) - b = self._bias_f16_cache.get(b_key) - if b is None or b.device != x.device: - b = bias.to(device=x.device, dtype=torch.float16) - self._bias_f16_cache[b_key] = b - with set_autotune_inputs([x_for_kernel, quantized_weight, scales, b]): - kernel = w8a16_gemm_bias(M_bucket, N, K) - else: - with set_autotune_inputs([x_for_kernel, quantized_weight, scales]): - kernel = w8a16_gemm(M_bucket, N, K) - config = kernel.config - self._tl_autotune_config_cache[cache_key] = config - except Exception: - # Fallback to default config if autotune fails - config = None - - # Use cached config or default parameters - if config is not None: - if use_bias_kernel: - kernel = w8a16_gemm_bias(M_bucket, N, K, **config) - else: - kernel = w8a16_gemm(M_bucket, N, K, **config) - else: - # Default config (backward compatible) - if use_bias_kernel: - kernel = w8a16_gemm_bias( - M_bucket, - N, - K, - block_M=block_m, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - else: - kernel = w8a16_gemm( - M_bucket, - N, - K, - block_M=block_m, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - - # Call kernel - out_idx=[3] means output is the 4th parameter, - # so we only pass inputs (x, quantized_weight, scales), and kernel returns output - tag_kernel = os.getenv("DIFFULEX_PROFILE_TAG_W8A16", "0") == "1" - tag_name = ( - f"{'w8a16_gemm_bias' if use_bias_kernel else 'w8a16_gemm'}" - f"[M={M} Mb={M_bucket} N={N} K={K} bm={block_m} bn=64 bk=128 st=2 th=128]" - ) - if use_bias_kernel: - # out_idx=[4] -> output is 5th arg (returned). Inputs: A, B, Scales, Bias - # NOTE: kernel expects fp16 bias (see kernel signature). 
- b_key = id(bias) - b = self._bias_f16_cache.get(b_key) - if b is None or b.device != x.device: - b = bias.to(device=x.device, dtype=torch.float16) - self._bias_f16_cache[b_key] = b - if tag_kernel: - with torch.profiler.record_function(tag_name): - output_full = kernel(x_for_kernel, quantized_weight, scales, b) - else: - output_full = kernel(x_for_kernel, quantized_weight, scales, b) - else: - if tag_kernel: - with torch.profiler.record_function(tag_name): - output_full = kernel(x_for_kernel, quantized_weight, scales) - else: - output_full = kernel(x_for_kernel, quantized_weight, scales) - output = output_full[:M, :] if M_bucket != M else output_full - - # Add bias if present - if (bias is not None) and (not use_bias_kernel): - output = output + bias - - return output - except Exception as e: - # Fallback to Python implementation on any error - # This includes kernel compilation errors, execution errors, etc. - import warnings - error_msg = str(e) - - # Extract meaningful error information - # Check for common error types - if 'sm_' in error_msg and ('not defined' in error_msg or 'fatal' in error_msg): - # CUDA architecture not supported - import re - arch_match = re.search(r"sm_(\d+)", error_msg) - if arch_match: - arch = arch_match.group(1) - error_msg = f"CUDA architecture sm_{arch} not supported by current CUDA toolkit" - else: - error_msg = "CUDA architecture not supported by current CUDA toolkit" - elif 'Compilation error' in error_msg: - # Extract the actual error after "Compilation error:" - idx = error_msg.find('Compilation error') - after = error_msg[idx + len('Compilation error'):] - # Find the first meaningful error line - lines = after.split('\n') - for line in lines: - line = line.strip() - if line and not line.startswith('#') and ('error:' in line.lower() or 'fatal' in line.lower()): - error_msg = f"CUDA compilation error: {line[:200]}" - break - else: - error_msg = "CUDA compilation error (see logs for details)" - elif 'pipeline' in error_msg.lower() and 'stage' in error_msg.lower(): - # Pipeline stages mismatch - import re - match = re.search(r'Got (\d+) stages and (\d+) pipeline stages', error_msg) - if match: - error_msg = f"Pipeline stages mismatch: detected {match.group(1)} stages, expected {match.group(2)}" - else: - error_msg = "Pipeline stages configuration error" - else: - # Truncate very long error messages (like CUDA source code) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - - # Only warn for unexpected errors - # For known issues (like unsupported CUDA architecture), silently fallback - # This prevents spam warnings when the environment doesn't support the kernel - if 'CUDA architecture not supported' in error_msg or 'sm_' in error_msg: - # Silently fallback for unsupported architectures (expected in some environments) - # The Python fallback is fully functional, so this is acceptable - pass - elif 'Pipeline stages' in error_msg: - # Pipeline stages mismatch - this might be fixable, but for now silently fallback - pass - else: - # Warn for unexpected errors that might indicate a real problem - warnings.warn( - f"TileLang kernel failed, falling back to Python implementation: {error_msg}", - UserWarning, - ) - # Count fallback and expose reason (opt-in). 
- try: - m, k = x.shape - n = int(quantized_weight.shape[0]) - except Exception: - m, n, k = -1, -1, -1 - self._rt_on_fallback(m=m, n=n, k=k, reason=error_msg) - return self._fallback_python_forward(x, quantized_weight, scales, bias) - else: - # TileLang not available, use Python reference - return self._fallback_python_forward(x, quantized_weight, scales, bias) - - def _fallback_python_forward( - self, - x: torch.Tensor, - quantized_weight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - """Fallback Python implementation: dequantize + F.linear.""" - # Dequantize for reference implementation - dequantized_weight = self.dequantize(quantized_weight, scales) - - # Compute linear output - return F.linear(x, dequantized_weight, bias) - - def clear_cache(self) -> None: - """Clear the weight quantization cache. - - Useful for memory management or when weights are updated (e.g., fine-tuning). - """ - self._weight_cache.clear() - self._dequant_weight_cache.clear() - diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py index f677e11..52e92ed 100644 --- a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py @@ -1,125 +1,35 @@ """ -W8A8 Linear quantization strategy (int8 weight + int8 activation). +W8A8 Linear quantization strategy (int8 weight + int8 activation), TileLang-free. -Implementation notes: -- We keep per-output-channel weight scales (same as W8A16). -- We quantize activations per-row (per token) to int8 and keep per-row scales. -- GEMM uses `torch._int_mm` (int8 x int8 -> int32) when available. - This op has a small-M constraint on some builds (e.g. M must be > 16), so we pad M minimally. -- If int8 GEMM is not available, we fall back to dequantized BF16 + cuBLAS (F.linear). +Implementation (vLLM-aligned): +- Activation quantization: `vllm._custom_ops.scaled_int8_quant` (dynamic per-token). +- GEMM+dequant: `vllm._custom_ops.cutlass_scaled_mm` (CUTLASS, with internal + triton fallback depending on shape/platform) — no `F.linear` slow path. + +Notes: +- Weight is stored as int8 in **K×N** layout (transposed), matching vLLM CUTLASS + kernels. +- Weight scale is stored as **[1, N]** float32 for broadcasting. """ from __future__ import annotations from typing import Any, Optional -import os -import warnings - -import torch -import torch.nn.functional as F +import torch # type: ignore -from diffulex.attention.metadata import is_warming_up from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -try: - from diffulex_kernel.python.linear_kernels import ( - w8a8_gemm, - w8a8_scaled_gemm, - w8a8_act_quant, - w8a8_fused_act_gemm, - ) - _TILELANG_AVAILABLE = True -except ImportError: - _TILELANG_AVAILABLE = False - w8a8_gemm = None - w8a8_scaled_gemm = None - w8a8_act_quant = None - w8a8_fused_act_gemm = None - -try: - # Optional: only needed for TileLang autotune warmup. - from tilelang.autotuner import set_autotune_inputs # type: ignore -except Exception: - set_autotune_inputs = None - - -_DEFAULT_TL_LINEAR_CFG: dict[str, Any] = { - "block_M": 64, - "block_N": 64, - "block_K": 128, - "num_stages": 2, - "threads": 128, -} - - -def _quantize_per_row_int8_torch(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Per-row symmetric int8 quantization. 
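The dynamic per-token activation quantization described in the new W8A8 docstring reduces to the following reference math (pure torch, no vLLM required); vLLM's `scaled_int8_quant` computes a conceptually equivalent per-row scale on the GPU:

import torch

x = torch.randn(4, 64, dtype=torch.float32)                            # activations [M, K]
scales = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / 127.0    # one scale per token (row)
x_q = torch.round(x / scales).clamp(-127, 127).to(torch.int8)
x_hat = x_q.to(torch.float32) * scales                                 # dequantized view applied in the GEMM epilogue
assert torch.all((x - x_hat).abs() <= scales / 2 + 1e-6)               # error bounded by half a quantization step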
- - Returns: - x_q: int8 [M, K] - x_scales: float32 [M] where dequant is x_q.float() * x_scales[:, None] - """ - # x: [M, K] - abs_max = x.abs().amax(dim=-1, keepdim=False) # [M] - scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float32) # [M] - x_q = torch.round(x.to(torch.float32) / scales.unsqueeze(-1)).clamp(-127, 127).to(torch.int8) - return x_q, scales - - -def _quantize_per_row_int8(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Per-row symmetric int8 quantization with optional TileLang fused kernel. - - Default: use TileLang fused kernel if available, otherwise fall back to torch ops. - - Env: - - DIFFULEX_W8A8_USE_TL_ACT_QUANT=0 to force torch fallback. - """ - use_tl = os.getenv("DIFFULEX_W8A8_USE_TL_ACT_QUANT", "1") == "1" - if ( - use_tl - and _TILELANG_AVAILABLE - and (w8a8_act_quant is not None) - and x.is_cuda - and x.dtype == torch.bfloat16 - and x.is_contiguous() - and x.dim() == 2 - ): - m, k = x.shape - # Choose a small set of block_M values to reduce wasted work on decode small-M. - if m <= 16: - block_m = 16 - elif m <= 32: - block_m = 32 - else: - block_m = 64 - try: - kernel = w8a8_act_quant( - m, - k, - block_M=block_m, - block_K=256, - threads=128, - ) - x_q, scales = kernel(x) - return x_q, scales - except Exception: - # Fall back silently to torch path for robustness (e.g., unsupported arch/toolchain). - pass - return _quantize_per_row_int8_torch(x) - -def _int8_mm(a_int8: torch.Tensor, b_int8: torch.Tensor) -> torch.Tensor: - """int8 GEMM -> int32. - - We prefer `torch._int_mm` when present. - """ - if hasattr(torch, "_int_mm"): - return torch._int_mm(a_int8, b_int8) - if hasattr(torch.ops.aten, "_int_mm"): - return torch.ops.aten._int_mm(a_int8, b_int8) - raise RuntimeError("No int8 GEMM backend found (torch._int_mm / aten._int_mm missing)") +def _require_vllm_ops(): + try: + from vllm import _custom_ops as ops # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "W8A8 requires vLLM's CUDA custom ops (vllm._custom_ops)." + ) from e + return ops @register_linear_strategy(weight_dtype="int8", act_dtype="int8") @@ -128,18 +38,10 @@ def _build_linear_int8_w8a8() -> LinearQuantizationStrategy: class LinearInt8W8A8Strategy(LinearQuantizationStrategy): - """W8A8 Linear strategy: int8 weight + int8 activation, output bf16.""" - - def __init__(self): + def __init__(self) -> None: super().__init__() - # weight_id -> (qweight_int8[N,K], scales_bf16[N]) + # Cache: id(weight) -> (qweight_int8 [N,K], w_scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - # weight_id -> qweight_t_int8[K,N] (for torch._int_mm) - self._weight_t_cache: dict[int, torch.Tensor] = {} - # speed-first option (uses extra memory) - self._dequant_weight_cache: dict[int, torch.Tensor] = {} - # (device_index, M_bucket, N, K) -> TileLang config dict for fused kernel - self._tl_fused_cfg_cache: dict[tuple[int, int, int, int], dict[str, Any]] = {} @property def name(self) -> str: @@ -156,52 +58,49 @@ def linear_act_format(self) -> str: def get_storage_dtype(self) -> tuple[torch.dtype, int]: return torch.int8, 1 - def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[int, ...]: - """Return shape of scales tensor for per-channel quantization. - - For [out_features, in_features] weight, scales shape is [out_features]. 
- """ + def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs: Any) -> tuple[int, ...]: _ = kwargs - if len(original_shape) < 2: - raise ValueError(f"Expected weight shape with at least 2 dims, got {original_shape}") - # Per-output-channel: scales shape is [out_features] + if len(original_shape) != 2: + raise ValueError(f"Expected 2D weight [N,K], got {original_shape}") return (original_shape[0],) - def clear_cache(self) -> None: - self._weight_cache.clear() - self._weight_t_cache.clear() - self._dequant_weight_cache.clear() - self._tl_fused_cfg_cache.clear() - - def quantize(self, tensor: torch.Tensor, **kwargs) -> tuple[torch.Tensor, Any]: + def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, Any]: _ = kwargs - # Per-output-channel symmetric quantization: scales shape [N] - abs_max = torch.abs(tensor).max(dim=-1, keepdim=True)[0] # [N, 1] - # Keep scales in fp16 to reduce scale quantization error (A8 paths are sensitive). - scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float16) # [N, 1] - q = torch.round(tensor / scales).clamp(-128, 127).to(torch.int8) - return q, scales.squeeze(-1) - - def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) -> torch.Tensor: - _ = kwargs - scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata - if scales is None: - raise ValueError("scales required for dequantization") - if scales.dim() == 1: - scales = scales.unsqueeze(-1) # [N, 1] - return (quantized.to(torch.float32) * scales.to(torch.float32)).to(torch.bfloat16) + if tensor.dim() != 2: + raise ValueError(f"Expected 2D weight [N,K], got shape={tuple(tensor.shape)}") + # per-output-channel symmetric int8, store K×N for cutlass_scaled_mm + w = tensor.to(torch.float32) + abs_max = w.abs().amax(dim=-1, keepdim=False) # [N] + scales = (abs_max.clamp(min=1e-8) / 127.0).to(torch.float32) # [N] + q_nk = torch.round(w / scales.unsqueeze(-1)).clamp(-127, 127).to(torch.int8) # [N,K] + # NOTE: vLLM CUTLASS scaled_mm expects b.stride(0) == 1, which is true + # for a transpose-view (non-contiguous) but not for a contiguous K×N tensor. + q_kn = q_nk.t() # [K,N], stride(0)==1 + scale_b = scales.unsqueeze(0).contiguous() # [1,N] + return q_kn, {"scales": scale_b} def quantize_weight_for_kernel( self, weight: torch.Tensor, *, device: torch.device | None = None, - **kwargs: Any, + **_: Any, ) -> tuple[torch.Tensor, Any]: - _ = kwargs + # Return int8 K×N weights + fp32 [1,N] scales for vLLM CUTLASS path. 
+ q_kn, meta = self.quantize(weight) if device is not None: - weight = weight.to(device=device) - return self.quantize(weight) + q_kn = q_kn.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return q_kn, meta["scales"] + + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: + _ = kwargs + scales = scale_or_metadata.get("scales") if isinstance(scale_or_metadata, dict) else scale_or_metadata + if scales is None: + raise ValueError("scales required for dequantization") + raise RuntimeError( + "W8A8 does not provide a dequantize path (to avoid the slow bf16 GEMM)." + ) def linear_forward( self, @@ -214,262 +113,44 @@ def linear_forward( ) -> torch.Tensor: _ = quant_kind - quant_scales = kwargs.pop("quant_scales", None) + ops = _require_vllm_ops() - # Resolve / cache quantized weight + scales - if weight.dtype == torch.int8: - if quant_scales is None: - raise ValueError("weight is int8 but quant_scales is None; expected per-channel scales tensor") - qweight = weight if weight.device == x.device else weight.to(device=x.device) - w_scales = quant_scales - # Prefer fp16 scales for quality (and fused kernel expects fp16 scales). - if w_scales.dtype != torch.float16: - w_scales = w_scales.to(dtype=torch.float16) - if w_scales.device != x.device: - w_scales = w_scales.to(device=x.device) - weight_id = id(weight) + # If weight already quantized by LinearBase.load-time quantization. + quant_scales = kwargs.get("quant_scales", None) + if weight is not None and weight.dtype == torch.int8 and quant_scales is not None: + # Expected: qweight is K×N int8, quant_scales is [1,N] fp32 + qweight = weight.to(device=x.device) + w_scales = quant_scales.to(device=x.device, dtype=torch.float32) else: - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None: - qweight, w_scales = self.quantize_weight_for_kernel(weight, device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) + wid = id(weight) + cached = self._weight_cache.get(wid) + if cached is None or cached[0].device != x.device: + qweight, meta = self.quantize(weight) + qweight = qweight.to(device=x.device) + w_scales = meta["scales"].to(device=x.device, dtype=torch.float32) + self._weight_cache[wid] = (qweight, w_scales) else: qweight, w_scales = cached - if qweight.device != x.device: - qweight = qweight.to(device=x.device) - w_scales = w_scales.to(device=x.device) - self._weight_cache[weight_id] = (qweight, w_scales) - - # Optional: use cuBLAS BF16 (dequant once) - if os.getenv("DIFFULEX_W8A8_PREFER_CUBLAS", "0") == "1": - deq_key = weight_id - deq_w = self._dequant_weight_cache.get(deq_key) - if deq_w is None or deq_w.device != x.device: - s = w_scales - if s.dim() == 1: - s = s.unsqueeze(-1) - deq_w = (qweight.to(torch.float32) * s.to(torch.float32)).to(torch.bfloat16) - self._dequant_weight_cache[deq_key] = deq_w - return F.linear(x, deq_w, bias) - - # Quantize activation per-row - if x.dtype not in (torch.bfloat16, torch.float16, torch.float32): - x = x.to(torch.bfloat16) - if x.dtype != torch.bfloat16: - x = x.to(torch.bfloat16) - - # Try TileLang fused quant + GEMM first (bf16 activation input). 
- use_fused = os.getenv("DIFFULEX_W8A8_USE_TL_FUSED_GEMM", "1") == "1" - if ( - use_fused - and _TILELANG_AVAILABLE - and (w8a8_fused_act_gemm is not None) - and x.is_cuda - and x.dtype == torch.bfloat16 - and x.dim() == 2 - and x.is_contiguous() - ): - try: - M, K = x.shape - N, K_w = qweight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_for_kernel = x - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.bfloat16) - x_pad[:M, :] = x - x_for_kernel = x_pad - - dev_idx = x.device.index or 0 - cfg_key = (dev_idx, M_bucket, N, K) - cfg = self._tl_fused_cfg_cache.get(cfg_key) - kernel = None - - # Only run autotune during warmup when autotuner inputs are available. - if cfg is None and is_warming_up() and set_autotune_inputs is not None: - try: - with set_autotune_inputs([x_for_kernel, qweight, w_scales]): - kernel = w8a8_fused_act_gemm(M_bucket, N, K) - # Only cache config if autotune succeeded (kernel has valid config) - if hasattr(kernel, 'config') and kernel.config is not None: - cfg = kernel.config - self._tl_fused_cfg_cache[cfg_key] = cfg - except Exception as autotune_err: - # Autotune failed (e.g., all configs failed to compile), use default - autotune_msg = str(autotune_err) - if len(autotune_msg) > 150: - autotune_msg = autotune_msg[:150] + "..." - warnings.warn( - f"W8A8 fused autotune failed ({autotune_msg}), using default config", - UserWarning, - ) - kernel = None - - # Non-warmup path: keep deterministic behavior with a default config. - if cfg is None: - cfg = _DEFAULT_TL_LINEAR_CFG - - if kernel is None: - kernel = w8a8_fused_act_gemm(M_bucket, N, K, **cfg) - out_full = kernel(x_for_kernel, qweight, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - if bias is not None: - out = out + bias - return out - except Exception as e: - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn( - f"W8A8 fused quant GEMM failed, falling back to quantize+GEMM: {error_msg}", - UserWarning, - ) - - # Step-local cache for activation quantization (reuse within one step for QKV/gate-up, etc.) 
- use_cache = os.getenv("DIFFULEX_W8A8_ACT_QUANT_CACHE", "1") == "1" - cached = None - if use_cache: - try: - from diffulex.utils.quantization.context import get_cached_act_quant, set_cached_act_quant - cached = get_cached_act_quant(x) - except Exception: - cached = None - if cached is not None: - x_q, x_scales = cached - else: - x_q, x_scales = _quantize_per_row_int8(x) - if use_cache: - try: - set_cached_act_quant(x, x_q, x_scales) - except Exception: - pass - if x_q.device != x.device: - x_q = x_q.to(device=x.device) - x_scales = x_scales.to(device=x.device) - - # Get shapes - M, K = x_q.shape - N, K_w = qweight.shape - assert K == K_w, f"K dimension mismatch: {K} != {K_w}" - - # Try TileLang kernel first if available - if _TILELANG_AVAILABLE and (w8a8_scaled_gemm is not None or w8a8_gemm is not None): - try: - # Check device - if x.device.type != 'cuda': - # Fall through to _int8_mm fallback - pass - else: - # Reduce TileLang JIT compilation churn using M-bucketing (similar to W8A16) - M_bucket = M - if M > 1: - if M <= 64: - M_bucket = 1 << (M - 1).bit_length() - else: - M_bucket = ((M + 63) // 64) * 64 - - x_q_for_kernel = x_q - if M_bucket != M: - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8) - x_pad[:M, :] = x_q - x_q_for_kernel = x_pad - x_scales_pad = torch.zeros((M_bucket,), device=x.device, dtype=torch.float32) - x_scales_pad[:M] = x_scales.to(torch.float32) - x_scales_for_kernel = x_scales_pad - else: - x_scales_for_kernel = x_scales.to(torch.float32) - - # Prefer fused-scale kernel: outputs bf16 directly, avoiding large int32->fp32 postprocessing. - if w8a8_scaled_gemm is not None: - kernel = w8a8_scaled_gemm( - M_bucket, - N, - K, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_full = kernel(x_q_for_kernel, qweight, x_scales_for_kernel, w_scales) - out = out_full[:M, :] if M_bucket != M else out_full - else: - # Fallback to int32-output kernel + python scaling - kernel = w8a8_gemm( - M_bucket, - N, - K, - block_M=64, - block_N=64, - block_K=128, - num_stages=2, - threads=128, - ) - out_i32_full = kernel(x_q_for_kernel, qweight) - out_i32 = out_i32_full[:M, :] if M_bucket != M else out_i32_full - - out_fp32 = out_i32.to(torch.float32) - out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1) - out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0) - out = out_fp32.to(torch.bfloat16) - - if bias is not None: - out = out + bias - return out - except Exception as e: - # Fallback to _int8_mm on any kernel error - import warnings - error_msg = str(e) - if len(error_msg) > 200: - error_msg = error_msg[:200] + "..." - warnings.warn(f"W8A8 TileLang kernel failed, falling back to torch._int_mm: {error_msg}", UserWarning) - - # Fallback: use torch._int_mm - # Prepare weight transpose for int8 GEMM: [N,K] -> [K,N] - wt = self._weight_t_cache.get(weight_id) - if wt is None or wt.device != x.device: - wt = qweight.t().contiguous() - self._weight_t_cache[weight_id] = wt - - # Some builds require M > 16 for int8 GEMM; pad minimally. - if M <= 16: - M_bucket = 17 - x_pad = torch.zeros((M_bucket, K), device=x.device, dtype=torch.int8) - x_pad[:M, :] = x_q - x_q_for_mm = x_pad - else: - x_q_for_mm = x_q - - try: - out_i32_full = _int8_mm(x_q_for_mm, wt) # [M_bucket, N] int32 - except Exception as e: - # Fallback: dequant + BF16 GEMM - msg = str(e) - if len(msg) > 200: - msg = msg[:200] + "..." 
- warnings.warn(f"W8A8 int8 GEMM failed, falling back to BF16 F.linear: {msg}", UserWarning) - deq_w = self.dequantize(qweight, w_scales) - return F.linear(x, deq_w, bias) - - out_i32 = out_i32_full[:M, :] if M <= 16 else out_i32_full - - # Apply scales: int32 * x_scale[m] * w_scale[n] - out_fp32 = out_i32.to(torch.float32) - out_fp32 = out_fp32 * x_scales.to(torch.float32).unsqueeze(-1) - out_fp32 = out_fp32 * w_scales.to(torch.float32).unsqueeze(0) - out = out_fp32.to(torch.bfloat16) - - if bias is not None: - out = out + bias - return out + # Flatten like torch.nn.functional.linear + orig_shape = x.shape + x2 = x.reshape(-1, x.shape[-1]) if x.dim() != 2 else x + if x2.dtype not in (torch.bfloat16, torch.float16): + x2 = x2.to(torch.bfloat16) + # dynamic per-token int8 quant + fused GEMM_DQ + x_q, x_s, _ = ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) + y = ops.cutlass_scaled_mm( + x_q, + qweight, + scale_a=x_s, + scale_b=w_scales, + out_dtype=x2.dtype, + bias=bias.to(dtype=x2.dtype) if bias is not None else None, + ) + + if orig_shape == x2.shape: + return y + if x.dim() == 1: + return y.squeeze(0) + return y.reshape(*orig_shape[:-1], y.shape[-1]) diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index 54eb97d..1cd8eb1 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -1,19 +1,14 @@ -""" -Marlin-style (vLLM AllSpark) W8A16 Linear quantization strategy. - -Goal: -- Replace Diffulex current W8A16 path (TileLang kernel that casts int8->bf16 inside) - with a vLLM-like fused path for decode small-M: - - per-out-channel int8 quantization (stored as uint8 with +128 bias) - - one-time N32K16 reorder (AllSpark repack) - - fused dequant + GEMM kernel (AllSpark w8a16 gemm) - -Notes: -- Despite the filename mentioning "marlin", the actual fused kernel we vendor is - vLLM's AllSpark Ampere W8A16 fused GEMM, which is the effective INT8 W8A16 - fast path in vLLM for this use-case. -- Fallback behavior is critical: if the extension is unavailable, or shapes are - unsupported (e.g., K%16!=0), we fall back to existing TileLang W8A16 or BF16. +"""W8A16 Linear quantization strategy using vLLM custom ops. + +This strategy uses vLLM's fused AllSpark W8A16 path via `vllm._custom_ops`: +- per-out-channel int8 quantization stored as uint8 (+128 bias) +- one-time N32K16 reorder (AllSpark repack) +- fused dequant + GEMM (AllSpark w8a16 gemm) + +Important: +- We intentionally do NOT vendor/compile a local AllSpark/Marlin extension in + Diffulex anymore. If `vllm._custom_ops` is unavailable, this strategy fails + fast (instead of silently compiling or falling back to a slow/oom-prone path). """ from __future__ import annotations @@ -27,27 +22,37 @@ from diffulex.utils.quantization.registry import register_linear_strategy from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -# Optional: existing TileLang fallback (already used by linear_int8_w8a16.py) try: - from diffulex_kernel.python.linear_kernels import w8a16_gemm as _tilelang_w8a16_gemm - _TILELANG_AVAILABLE = True + import vllm._custom_ops as _vllm_ops except Exception: - _tilelang_w8a16_gemm = None - _TILELANG_AVAILABLE = False + _vllm_ops = None -# Vendored vLLM-style fused W8A16 (AllSpark) ops. 
-try: - from diffulex_kernel.python.marlin_ops import ( # noqa: F401 - allspark_w8a16_gemm as _allspark_w8a16_gemm, - rearrange_kn_weight_as_n32k16_order as _allspark_repack, - is_available as _allspark_is_available, + +def _allspark_is_available() -> bool: + return bool( + _vllm_ops is not None + and hasattr(_vllm_ops, "allspark_w8a16_gemm") + and hasattr(_vllm_ops, "allspark_repack_weight") ) -except Exception: - _allspark_w8a16_gemm = None - _allspark_repack = None - def _allspark_is_available() -> bool: - return False + +def _allspark_w8a16_gemm(*args, **kwargs): + if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_w8a16_gemm"): + raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_w8a16_gemm`.") + return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) + + +def _allspark_repack_weight(b_qweight_kn: torch.Tensor, scales_1xn: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Repack KxN uint8 qweight + 1xN scales into (N_32,K) + (1,N_32) for AllSpark GEMM.""" + if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_repack_weight"): + raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_repack_weight`.") + q_reorder, s_reorder, _ = _vllm_ops.allspark_repack_weight( + b_qweight_kn, + scales_1xn, + None, + False, + ) + return q_reorder, s_reorder @register_linear_strategy(weight_dtype="marlin_int8", act_dtype="bf16") @@ -56,7 +61,7 @@ def _build_linear_marlin_int8_w8a16() -> LinearQuantizationStrategy: class LinearMarlinInt8W8A16Strategy(LinearQuantizationStrategy): - """W8A16 strategy using vendored vLLM AllSpark fused GEMM + repack.""" + """W8A16 strategy using vLLM custom ops (AllSpark fused GEMM + repack).""" def __init__(self) -> None: super().__init__() @@ -65,7 +70,10 @@ def __init__(self) -> None: @property def name(self) -> str: - return "linear_marlin_int8_w8a16" + # NOTE: Keep strategy naming consistent with the public W8A16 INT8 path. + # The underlying implementation is a Marlin/AllSpark-style fused kernel, + # but the user-facing strategy name should not be tied to a particular kernel brand. + return "linear_int8_w8a16" @property def linear_weight_format(self) -> str: @@ -148,44 +156,54 @@ def quantize_weight_for_kernel( abs_max = torch.abs(weight).max(dim=-1)[0] # [N] scales = (abs_max.clamp(min=1e-8) / 127.0).to(dtype=torch.bfloat16) # [N] - # Quantize to signed int8, then store as uint8 with +128 bias. - w_fp32 = weight.to(torch.float32) - s_fp32 = scales.to(torch.float32).unsqueeze(-1) # [N,1] - q_i8 = torch.round(w_fp32 / s_fp32).clamp(-128, 127).to(torch.int16) # [N,K] - q_u8 = (q_i8 + 128).to(torch.uint8) # [N,K] in [0,255] - - if not _allspark_is_available() or _allspark_repack is None: - # Fallback storage (no reorder). Keep [N,K] and [N]. + # IMPORTANT (OOM fix): + # Avoid allocating a full [N,K] fp32 copy (and an extra transpose buffer). + # Quantize in small row blocks and (when using AllSpark) write directly into + # the repack input layout B_kn=[K,N], so we never materialize q_u8 + transpose. + try: + block_n = int(os.getenv("DIFFULEX_W8A16_QUANT_BLOCK_N", "256")) + except Exception: + block_n = 256 + block_n = max(1, block_n) + + use_allspark = _allspark_is_available() + if use_allspark: + # AllSpark repack expects B in (K,N) contiguous layout. 
+ b_kn = torch.empty((k, n), device=weight.device, dtype=torch.uint8) # [K,N] + for i in range(0, n, block_n): + j = min(i + block_n, n) + w_blk = weight[i:j, :] # [B,K] + s_blk = scales[i:j].unsqueeze(-1) # [B,1] + # Quantize to signed int in bf16 to minimize temporary memory. + q_i16 = torch.round(w_blk / s_blk).clamp(-128, 127).to(torch.int16) # [B,K] + q_u8_blk = (q_i16 + 128).to(torch.uint8) # [B,K] + # Write directly into [K,N] buffer. + b_kn[:, i:j] = q_u8_blk.transpose(0, 1) + else: + # Fallback storage (no reorder). Keep [N,K] and [N] (padded to N_32). # Note: forward will detect unavailable allspark and fallback further. + q_pad = torch.full((n_32, k), 128, device=weight.device, dtype=torch.uint8) + for i in range(0, n, block_n): + j = min(i + block_n, n) + w_blk = weight[i:j, :] # [B,K] + s_blk = scales[i:j].unsqueeze(-1) # [B,1] + q_i16 = torch.round(w_blk / s_blk).clamp(-128, 127).to(torch.int16) # [B,K] + q_pad[i:j, :] = (q_i16 + 128).to(torch.uint8) if n_32 != n: - q_pad = torch.full((n_32, k), 128, device=q_u8.device, dtype=torch.uint8) - q_pad[:n, :] = q_u8 s_pad = torch.zeros((n_32,), device=scales.device, dtype=torch.bfloat16) s_pad[:n] = scales return q_pad.contiguous(), s_pad.contiguous() - return q_u8.contiguous(), scales.contiguous() - - # AllSpark repack expects B in (K,N) contiguous layout. - b_kn = q_u8.transpose(0, 1).contiguous() # [K,N] - - q_reorder = torch.empty((n_32, k), device=b_kn.device, dtype=torch.uint8) - s_reorder = torch.empty((n_32,), device=scales.device, dtype=torch.bfloat16) + return q_pad[:n, :].contiguous(), scales.contiguous() - # No zero-point path for symmetric signed int8 (bias128 already handled). - _allspark_repack( - b_kn, - scales.contiguous(), - None, - False, # has_zp - q_reorder, - s_reorder, - None, - int(k), - int(n), - int(n_32), + # vLLM expects scales in [1, N] layout for repack. + q_reorder, s_reorder_1xn = _allspark_repack_weight( + b_kn.contiguous(), + scales.unsqueeze(0).contiguous(), ) - return q_reorder.contiguous(), s_reorder.contiguous() + # Store scales as 1D for LinearBase buffers; linear_forward will reshape as needed. + s_1d = s_reorder_1xn.reshape(-1).to(dtype=torch.bfloat16) + return q_reorder.contiguous(), s_1d.contiguous() def quantize_act_for_kernel( self, @@ -254,9 +272,15 @@ def linear_forward( else: qweight, scales = cached - # If fused kernel isn't available, fall back to TileLang or BF16. - if _allspark_w8a16_gemm is None or not _allspark_is_available(): - return self._fallback(x, weight, qweight, scales, bias) + # If fused kernel isn't available, fall back to BF16 only if original weight exists; + # otherwise fail fast (do NOT dequantize a full matrix, which is memory-prohibitive). + if not _allspark_is_available(): + if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): + return F.linear(x, weight, bias) + raise RuntimeError( + "vLLM AllSpark W8A16 fused kernel is unavailable, and bf16 weight is not present. " + "Please ensure vLLM custom ops are installed and loadable (`import vllm._custom_ops`)." + ) # AllSpark kernel requires CUDA and contiguous inputs. if x2.device.type != "cuda": @@ -283,10 +307,12 @@ def linear_forward( sm_count, sm_version = self._get_sm_info(x2.device) cublas_thr = self._cublas_m_threshold() + # vLLM allspark expects scales as 1xN (or equivalent contiguous view). 
+ scales_1xn = scales.reshape(1, -1).contiguous() y2 = _allspark_w8a16_gemm( x2.contiguous(), qweight.contiguous(), - scales.contiguous(), + scales_1xn, None, # b_qzeros n, -1, # group_size (only supports -1) @@ -308,49 +334,6 @@ def linear_forward( y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) return y - def _fallback( - self, - x: torch.Tensor, - weight: torch.Tensor, - qweight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - # Prefer existing TileLang W8A16 if available and inputs are CUDA. - if _TILELANG_AVAILABLE and _tilelang_w8a16_gemm is not None and x.device.type == "cuda": - try: - x2 = x if x.dim() == 2 else x.reshape(-1, x.shape[-1]) - # TileLang expects int8 weight. If our qweight is uint8 bias128, convert to int8 on the fly. - if qweight.dtype == torch.uint8: - q_i8 = (qweight.to(torch.int16) - 128).to(torch.int8) - else: - q_i8 = qweight - y2 = _tilelang_w8a16_gemm(x2, q_i8, scales, False) - if bias is not None: - y2 = y2 + bias - if x.dim() == 2: - return y2 - if x.dim() == 1: - return y2.squeeze(0) - return y2.reshape(*x.shape[:-1], y2.shape[-1]) - except Exception: - pass - - # Last resort: BF16 F.linear using dequantized weight if bf16 is available. - if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): - return F.linear(x, weight, bias) - - # Dequantize from qweight + scales and use cuBLAS via F.linear. - # qweight may be [N_32,K] or reordered; we cannot reliably undo reorder here. - # So only attempt this if qweight looks like plain [N,K] (no padding). - if qweight.dim() == 2 and scales.dim() == 1 and qweight.shape[0] == scales.shape[0]: - if qweight.dtype == torch.uint8: - q = (qweight.to(torch.int16) - 128).to(torch.int8) - else: - q = qweight - s = scales.unsqueeze(-1).to(torch.float32) - w_deq = (q.to(torch.float32) * s).to(torch.bfloat16) - return F.linear(x, w_deq, bias) - - raise RuntimeError("AllSpark/TileLang unavailable and safe fallback path not found for marlin_int8 W8A16.") + # NOTE: We intentionally do not provide a generic dequantize+F.linear fallback for reordered weights. + # It materializes a full bf16 matrix and is prone to OOM on large models. 
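Review note: a minimal end-to-end sketch of the vLLM AllSpark W8A16 path that the strategy above drives, for readers unfamiliar with the op layout. It assumes the installed vLLM build exposes `allspark_repack_weight` and `allspark_w8a16_gemm` with the signatures used in this patch; the CUBLAS M threshold value, the sm_version encoding, and the helper name `w8a16_allspark_linear` are illustrative assumptions, not values taken from the strategy.

import torch
import vllm._custom_ops as ops  # hard requirement, mirroring the strategy's fail-fast behavior


def w8a16_allspark_linear(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Per-output-channel symmetric int8, stored as uint8 with +128 bias (as in quantize_weight_for_kernel).
    n, k = weight.shape
    scales = (weight.abs().amax(dim=-1).clamp(min=1e-8) / 127.0).to(torch.bfloat16)             # [N]
    q_u8 = (torch.round(weight / scales.unsqueeze(-1)).clamp(-128, 127) + 128).to(torch.uint8)  # [N, K]
    # Repack expects a (K, N) contiguous input and returns the N32K16-reordered weight plus 1 x N_32 scales.
    q_reorder, s_reorder, _ = ops.allspark_repack_weight(
        q_u8.t().contiguous(), scales.unsqueeze(0).contiguous(), None, False,  # no zero-points
    )
    props = torch.cuda.get_device_properties(x.device)
    return ops.allspark_w8a16_gemm(
        x.contiguous(),                  # bf16/fp16 activations, [M, K]
        q_reorder.contiguous(),
        s_reorder.contiguous(),
        None,                            # b_qzeros (symmetric path)
        n,
        -1,                              # group_size: only -1 (per-channel) is supported
        props.multi_processor_count,
        props.major * 10 + props.minor,  # sm_version encoding assumed here (e.g. 80 for SM80)
        256,                             # CUBLAS M threshold: illustrative, not the strategy's default
        False,                           # has_zp
        True,                            # n32k16_reorder
    )

The strategy differs from this sketch mainly in that it quantizes the weight in row blocks (DIFFULEX_W8A16_QUANT_BLOCK_N) to avoid a full fp32 copy, caches the repacked weight per device, and stores the scales as a 1D buffer that is reshaped back to 1xN right before the GEMM call.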
diff --git a/diffulex_bench/configs/awq_bf16kv_varlen.yml b/diffulex_bench/configs/awq_bf16kv_varlen.yml new file mode 100644 index 0000000..62c2cb8 --- /dev/null +++ b/diffulex_bench/configs/awq_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# AWQ (W4A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-awq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "awq" + linear_mlp_weight_dtype: "awq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/awq_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml new file mode 100644 index 0000000..8c76f4e --- /dev/null +++ b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml @@ -0,0 +1,48 @@ +# AWQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-awq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: AWQ Marlin + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "awq_marlin" + linear_mlp_weight_dtype: "awq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/awq_marlin_bf16kv" + save_results: true + use_tqdm: true + diff --git a/diffulex_bench/configs/fp8_bf16kv_varlen.yml b/diffulex_bench/configs/fp8_bf16kv_varlen.yml new file mode 100644 index 0000000..2ac105b --- /dev/null +++ b/diffulex_bench/configs/fp8_bf16kv_varlen.yml @@ -0,0 +1,48 @@ +# FP8 Linear (vLLM) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: FP8 weights (vLLM ops) + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: 
"varlen" + linear_attn_weight_dtype: "fp8" + linear_mlp_weight_dtype: "fp8" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/fp8_bf16kv" + save_results: true + use_tqdm: true + diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_bf16kv_varlen.yml new file mode 100644 index 0000000..b7fd14d --- /dev/null +++ b/diffulex_bench/configs/gptq_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ (W4A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_bf16kv" + save_results: true + use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml new file mode 100644 index 0000000..1505192 --- /dev/null +++ b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml @@ -0,0 +1,47 @@ +# GPTQ (W4A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 2 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_bf16kv" + save_results: true + use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml new file mode 100644 index 0000000..858b31a --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml @@ -0,0 +1,48 @@ +# GPTQ Marlin (W4/W8, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + 
tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_bf16kv" + save_results: true + use_tqdm: true + diff --git a/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml new file mode 100644 index 0000000..bae9875 --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W2, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W2) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_w2_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml new file mode 100644 index 0000000..f8265d3 --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_w4_bf16kv" + 
save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml new file mode 100644 index 0000000..e20c9be --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W8, A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_marlin_w8_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml new file mode 100644 index 0000000..03fe3e7 --- /dev/null +++ b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ (W2A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ (W2A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_w2_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml new file mode 100644 index 0000000..1f68616 --- /dev/null +++ b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml @@ -0,0 +1,47 @@ +# GPTQ (W8A16) + BF16 KV Cache (varlen mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w8" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true # Required for varlen mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + 
# Quantization: GPTQ (W8A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "varlen" + linear_attn_weight_dtype: "gptq" + linear_mlp_weight_dtype: "gptq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_varlen/gptq_w8_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_kernel/__init__.py b/diffulex_kernel/__init__.py index 8a47168..38ab37d 100644 --- a/diffulex_kernel/__init__.py +++ b/diffulex_kernel/__init__.py @@ -1,6 +1,54 @@ -from diffulex_kernel.python.dllm_flash_attn_kernels import dllm_flash_attn_decode, dllm_flash_attn_prefill -from diffulex_kernel.python.kv_cache_kernels import ( - store_kvcache_distinct_layout, - store_kvcache_unified_layout, - load_kvcache, -) +"""Diffulex CUDA kernel package. + +Keep this module lightweight: importing `diffulex_kernel` should not eagerly +import optional heavy deps (e.g. TileLang) unless the corresponding kernels are +actually used. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from diffulex_kernel.python.dllm_flash_attn_kernels import ( # noqa: F401 + dllm_flash_attn_decode as dllm_flash_attn_decode, + dllm_flash_attn_prefill as dllm_flash_attn_prefill, + ) + from diffulex_kernel.python.kv_cache_kernels import ( # noqa: F401 + load_kvcache as load_kvcache, + store_kvcache_distinct_layout as store_kvcache_distinct_layout, + store_kvcache_unified_layout as store_kvcache_unified_layout, + ) + + +def __getattr__(name: str): + if name == "dllm_flash_attn_decode": + from diffulex_kernel.python.dllm_flash_attn_kernels import dllm_flash_attn_decode + + return dllm_flash_attn_decode + if name == "dllm_flash_attn_prefill": + from diffulex_kernel.python.dllm_flash_attn_kernels import dllm_flash_attn_prefill + + return dllm_flash_attn_prefill + if name == "store_kvcache_distinct_layout": + from diffulex_kernel.python.kv_cache_kernels import store_kvcache_distinct_layout + + return store_kvcache_distinct_layout + if name == "store_kvcache_unified_layout": + from diffulex_kernel.python.kv_cache_kernels import store_kvcache_unified_layout + + return store_kvcache_unified_layout + if name == "load_kvcache": + from diffulex_kernel.python.kv_cache_kernels import load_kvcache + + return load_kvcache + raise AttributeError(name) + + +__all__ = [ + "dllm_flash_attn_decode", + "dllm_flash_attn_prefill", + "store_kvcache_distinct_layout", + "store_kvcache_unified_layout", + "load_kvcache", +] diff --git a/diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu b/diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu deleted file mode 100644 index 1b408d5..0000000 --- a/diffulex_kernel/csrc/marlin/allspark_qgemm_w8a16.cu +++ /dev/null @@ -1,542 +0,0 @@ -#include "allspark_utils.cuh" -#include -#include - -// NOTE: This file is vendored (with minimal modifications) from -// vLLM `csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu`. -// We remove vLLM's registration macros and expose the entrypoint via -// a local PyTorch extension binding in `torch_bindings_marlin.cpp`. 
- -at::Tensor as_g_workspace; - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - -torch::Tensor allspark_w8a16_gemm( - torch::Tensor const& a, torch::Tensor const& b_qweight, - torch::Tensor const& b_scales, c10::optional const& b_qzeros, - int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, - int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) { - TORCH_CHECK_NOT_IMPLEMENTED( - false, "allspark_w8a16_gemm(..) requires CUDA_ARCH >= 8.0"); - return torch::empty({1, 1}); -} - -#else - -// --- The remainder of this file is largely identical to vLLM upstream. --- -// For maintainability we keep code structure intact. - -namespace allspark { - -template -struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { - static constexpr int LDG_ELEMENT_CNT_A = 8; - static constexpr int LDG_ELEMENT_CNT_B = 16; - static constexpr int WARP_SIZE = 32; - static constexpr int M_SIZE_ONE_LOAD = (BLOCK * LDG_ELEMENT_CNT_A) / 32; - static constexpr int N_SIZE_ONE_LOAD = (BLOCK * LDG_ELEMENT_CNT_B) / 32; - - __device__ GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK( - const SM8x_GEMM_W8A16_Splitk_Params& k_params, - const uint32_t& A_smem_addr, const uint32_t& BQ_smem_addr, - const uint32_t& A_stage_stride, const uint32_t& BQ_stage_stride) - : params(k_params), - A_smem_base_addr(A_smem_addr), - BQ_smem_base_addr(BQ_smem_addr), - A_smem_stage_stride(A_stage_stride), - BQ_smem_stage_stride(BQ_stage_stride) { - this_block_A_base_ptr = params.A_ptr + blockIdx.x * Mtile * params.K + - blockIdx.z * params.SplitK; - this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K + - blockIdx.z * params.SplitK * 4; - - const auto lane_id = threadIdx.x % WARP_SIZE; - - const auto Aldg_row_base_idx = threadIdx.x / 4; - Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A; - const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx; - - Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B; - const auto Bldg_row_base_idx = threadIdx.x / 8; - const int Bldg_base_offset = - Bldg_row_base_idx * params.K * 4 + Bldg_col_idx; - - this_block_A_base_ptr += Aldg_base_offset; - this_block_B_base_ptr += Bldg_base_offset; - - const int sts_a_base_offset = - (threadIdx.x / 4) * 32 + - ((lane_id % 4) ^ ((lane_id / 4) % 4) ^ ((lane_id / 4) / 4)) * - LDG_ELEMENT_CNT_A; - const int sts_bq_base_offset = - Bldg_row_base_idx * 32 * 4 + - ((threadIdx.x % 8) ^ (((threadIdx.x / 8) % 2) * 4)) * LDG_ELEMENT_CNT_B; - - A_smem_base_addr += sts_a_base_offset * sizeof(FType); - BQ_smem_base_addr += sts_bq_base_offset * sizeof(uint8_t); - - A_ldg_guard = 0; - B_ldg_guard = 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { - auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD; - if (m_idx < params.M) { - A_ldg_guard |= (1u << i); - } - } - - const int N_padded = (params.N + 31) / 32 * 32; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { - auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 + - i * N_SIZE_ONE_LOAD; - if (n_idx < N_padded) { - B_ldg_guard |= (1u << i); - } - } - } - - __device__ void ldgsts_first_ktiles(const int& first_k_tile, - const int& k_tiles) { - const int A_src_size = Aldg_col_idx < first_k_tile ? 
16 : 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - A_smem_base_addr + (i * M_SIZE_ONE_LOAD * 32) * sizeof(FType), - this_block_A_base_ptr + i * M_SIZE_ONE_LOAD * params.K, A_src_size, - (A_ldg_guard & (1u << i)) != 0); - } - - const int B_src_size = (Bldg_col_idx / 4) < first_k_tile ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - BQ_smem_base_addr + (i * N_SIZE_ONE_LOAD * 32) * sizeof(uint8_t), - this_block_B_base_ptr + i * N_SIZE_ONE_LOAD * params.K, B_src_size, - (B_ldg_guard & (1u << i)) != 0); - } - - cp_async_commit_group(); - this_block_A_base_ptr += first_k_tile; - this_block_B_base_ptr += (first_k_tile * 4); - - for (int stage_idx = 1; stage_idx < NStage - 1; ++stage_idx) { - if (stage_idx < k_tiles) { - const int A_src_size2 = - Aldg_col_idx < 16 ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; - ++i) { - cp_async<16>( - A_smem_base_addr + A_smem_stage_stride * stage_idx + - (i * M_SIZE_ONE_LOAD * 32) * sizeof(FType), - this_block_A_base_ptr + i * M_SIZE_ONE_LOAD * params.K, A_src_size2, - (A_ldg_guard & (1u << i)) != 0); - } - - const int B_src_size2 = - (Bldg_col_idx / 4) < 16 ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; - ++i) { - cp_async<16>( - BQ_smem_base_addr + BQ_smem_stage_stride * stage_idx + - (i * N_SIZE_ONE_LOAD * 32) * sizeof(uint8_t), - this_block_B_base_ptr + i * N_SIZE_ONE_LOAD * params.K, B_src_size2, - (B_ldg_guard & (1u << i)) != 0); - } - - cp_async_commit_group(); - this_block_A_base_ptr += 16; - this_block_B_base_ptr += 64; - } - } - } - - __device__ void ldgsts(const int& k_tile_idx, const int& smem_stage_idx, - const int& k_tiles, const int& K_tile) { - if (k_tile_idx + NStage - 1 < k_tiles) { - const int A_src_size = - (Aldg_col_idx < K_tile) ? 16 : 0; -#pragma unroll - for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - A_smem_base_addr + A_smem_stage_stride * smem_stage_idx + - (i * M_SIZE_ONE_LOAD * 32) * sizeof(FType), - this_block_A_base_ptr + i * M_SIZE_ONE_LOAD * params.K, A_src_size, - (A_ldg_guard & (1u << i)) != 0); - } - - const int B_src_size = - ((Bldg_col_idx / 4) < K_tile) ? 
16 : 0; -#pragma unroll - for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { - cp_async<16>( - BQ_smem_base_addr + BQ_smem_stage_stride * smem_stage_idx + - (i * N_SIZE_ONE_LOAD * 32) * sizeof(uint8_t), - this_block_B_base_ptr + i * N_SIZE_ONE_LOAD * params.K, B_src_size, - (B_ldg_guard & (1u << i)) != 0); - } - cp_async_commit_group(); - this_block_A_base_ptr += K_tile; - this_block_B_base_ptr += (K_tile * 4); - } - } - - const SM8x_GEMM_W8A16_Splitk_Params& params; - const FType* this_block_A_base_ptr; - const QType* this_block_B_base_ptr; - uint32_t A_smem_base_addr; - uint32_t BQ_smem_base_addr; - uint32_t A_smem_stage_stride; - uint32_t BQ_smem_stage_stride; - int Aldg_col_idx; - int Bldg_col_idx; - uint32_t A_ldg_guard; - uint32_t B_ldg_guard; -}; - -template -struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { - static constexpr int WARP_SIZE = 32; - static constexpr int WARP_NTILE = 64; - static constexpr int WARP_NITER = WARP_NTILE / 8; - - __device__ ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK( - const SM8x_GEMM_W8A16_Splitk_Params& k_params, - const uint32_t& A_smem_addr, const uint32_t& BQ_smem_addr, - const uint32_t& A_stage_stride, const uint32_t& BQ_stage_stride) - : params(k_params), - A_smem_base_addr(A_smem_addr), - BQ_smem_base_addr(BQ_smem_addr), - A_smem_stage_stride(A_stage_stride), - BQ_smem_stage_stride(BQ_stage_stride) { - const auto lane_id = threadIdx.x % WARP_SIZE; - const auto warp_id = (threadIdx.x % 128) / WARP_SIZE; - - load_a_base_offset[0] = (warp_id / 2) * 16 * 32 + (lane_id % 16) * 2; - load_a_base_offset[1] = (warp_id / 2) * 16 * 32 + (lane_id % 16) * 2 + 16; - load_b_base_offset[0] = (warp_id % 2) * 64 * 32 + (lane_id / 4) * 32 + - (lane_id % 4) * 8; - load_b_base_offset[1] = (warp_id % 2) * 64 * 32 + (lane_id / 4) * 32 + - (lane_id % 4) * 8 + 16; - -#pragma unroll - for (int i = 0; i < Mtile / 16; ++i) { -#pragma unroll - for (int j = 0; j < WARP_NITER; ++j) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - C_frag[i][j][k] = 0.f; - } - } - } - params_n_idx = - blockIdx.y * Ntile + warp_id * WARP_NTILE + (lane_id / 4) * 4; - } - - __device__ void lds(const int& smem_stage_idx, const int& reg_buf_idx, - const int& k_phase_idx) { - uint32_t A_smem_addr = - A_smem_base_addr + A_smem_stage_stride * smem_stage_idx; - uint32_t B_smem_addr = - BQ_smem_base_addr + BQ_smem_stage_stride * smem_stage_idx; - -#pragma unroll - for (int i = 0; i < Mtile / 16; ++i) { - ldsm_4(A_frag[reg_buf_idx][i][0], A_frag[reg_buf_idx][i][1], - A_frag[reg_buf_idx][i][2], A_frag[reg_buf_idx][i][3], - A_smem_addr + (load_a_base_offset[k_phase_idx] + i * 16 * 32) * - sizeof(FType)); - } -#pragma unroll - for (int i = 0; i < WARP_NTILE / 32; ++i) { - lds128(BQ_frag[reg_buf_idx][4 * i + 0], BQ_frag[reg_buf_idx][4 * i + 1], - BQ_frag[reg_buf_idx][4 * i + 2], BQ_frag[reg_buf_idx][4 * i + 3], - B_smem_addr + (load_b_base_offset[k_phase_idx] + i * 32 * 32) * - sizeof(uint8_t)); - } - - // dequant B -#pragma unroll - for (int i = 0; i < WARP_NITER / 2; ++i) { - cvt_8bx4_to_16bx4_bias128(BQ_frag[reg_buf_idx][2 * i], - BF_frag[reg_buf_idx][2 * i]); - if (has_zp) { - BF_frag[reg_buf_idx][2 * i][0] = - __hsub2(BF_frag[reg_buf_idx][2 * i][0], num2num2(B_zero[i].x)); - BF_frag[reg_buf_idx][2 * i][1] = - __hsub2(BF_frag[reg_buf_idx][2 * i][1], num2num2(B_zero[i].x)); - } - - BF_frag[reg_buf_idx][2 * i][0] = - __hmul2(BF_frag[reg_buf_idx][2 * i][0], num2num2(B_scale[i].x)); - BF_frag[reg_buf_idx][2 * i][1] = - 
__hmul2(BF_frag[reg_buf_idx][2 * i][1], num2num2(B_scale[i].x)); - - cvt_8bx4_to_16bx4_bias128(BQ_frag[reg_buf_idx][2 * i + 1], - BF_frag[reg_buf_idx][2 * i + 1]); - if (has_zp) { - BF_frag[reg_buf_idx][2 * i + 1][0] = - __hsub2(BF_frag[reg_buf_idx][2 * i + 1][0], num2num2(B_zero[i].y)); - BF_frag[reg_buf_idx][2 * i + 1][1] = - __hsub2(BF_frag[reg_buf_idx][2 * i + 1][1], num2num2(B_zero[i].y)); - } - - BF_frag[reg_buf_idx][2 * i + 1][0] = - __hmul2(BF_frag[reg_buf_idx][2 * i + 1][0], num2num2(B_scale[i].y)); - BF_frag[reg_buf_idx][2 * i + 1][1] = - __hmul2(BF_frag[reg_buf_idx][2 * i + 1][1], num2num2(B_scale[i].y)); - } - } - - __device__ void ldg_params() { - const int N_padded = (params.N + 31) / 32 * 32; - // load B scale and zero_point -#pragma unroll - for (int i = 0; i < WARP_NTILE / 32; ++i) { - ldg64_ca(B_scale[2 * i + 0], B_scale[2 * i + 1], - params.B_scale_ptr + params_n_idx + i * 32, - (params_n_idx + i * 32) < N_padded); - if (has_zp) { - ldg64_ca(B_zero[2 * i + 0], B_zero[2 * i + 1], - params.B_zero_ptr + params_n_idx + i * 32, - (params_n_idx + i * 32) < N_padded); - } - } - } - - __device__ void mma(const int& reg_buf_idx) { -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { - hmma16816_f32( - C_frag[m_idx][n_idx], A_frag[reg_buf_idx][m_idx], - reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); - } - } - } - - __device__ void fused_splitk_reduce() { - if (gridDim.z > 1) { - auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y; - if (threadIdx.x == 0) { - uint32_t* red_count_ptr = params.red_count_ptr + blk_red_idx; - uint32_t count; - do { - __threadfence_block(); - asm volatile("ld.global.cg.b32 %0, [%1];" - : "=r"(count) - : "l"(red_count_ptr)); - } while (count != blockIdx.z); - } - __syncthreads(); - - auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4; - if (blockIdx.z != 0) { - float temp_frag[Mtile / 16][WARP_NITER][4]; -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - temp_frag[m_idx][n_idx][k] = - params.C_tmp_ptr[C_tmp_base_offset + - (m_idx * Ntile + n_idx * 8 + k)]; - } - } - } -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - C_frag[m_idx][n_idx][k] += temp_frag[m_idx][n_idx][k]; - } - } - } - } - __syncthreads(); - - if (blockIdx.z != gridDim.z - 1) { -#pragma unroll - for (int m_idx = 0; m_idx < Mtile / 16; ++m_idx) { -#pragma unroll - for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { -#pragma unroll - for (int k = 0; k < 4; ++k) { - params.C_tmp_ptr[C_tmp_base_offset + - (m_idx * Ntile + n_idx * 8 + k)] = - C_frag[m_idx][n_idx][k]; - } - } - } - if (threadIdx.x == 0) { - atomicAdd(params.red_count_ptr + blk_red_idx, 1); - } - return; - } - } - } - - __device__ void stg(const int& m_idx_base, const int& n_idx_base) { - auto m_idx = m_idx_base + (threadIdx.x / 32) * 16 + (threadIdx.x % 32) / 4; - auto n_idx = n_idx_base + (threadIdx.x % 4) * 2; - - if (m_idx < params.M && n_idx < params.N) { - auto C_ptr = params.C_ptr + m_idx * params.N + n_idx; - float2 r; - r.x = C_frag[(threadIdx.x / 32)][(threadIdx.x % 32) / 4][0]; - r.y = C_frag[(threadIdx.x / 32)][(threadIdx.x % 32) / 4][1]; - if constexpr (std::is_same::value) { - *reinterpret_cast(C_ptr) = __float22half2_rn(r); - } else { - 
*reinterpret_cast(C_ptr) = __float22bfloat162_rn(r); - } - } - } - - const SM8x_GEMM_W8A16_Splitk_Params& params; - uint32_t A_smem_base_addr; - uint32_t BQ_smem_base_addr; - uint32_t A_smem_stage_stride; - uint32_t BQ_smem_stage_stride; - int load_a_base_offset[2]; - int load_b_base_offset[2]; - int params_n_idx; - uint32_t A_frag[2][Mtile / 16][4]; - uint32_t BQ_frag[2][4 * (WARP_NTILE / 32)]; - uint32_t BF_frag[2][WARP_NITER][4]; - uint2 B_scale[2 * (WARP_NTILE / 32)]; - uint2 B_zero[2 * (WARP_NTILE / 32)]; - float C_frag[Mtile / 16][WARP_NITER][4]; -}; - -template -__global__ void - ampere_hgemm_W8A16_perc_f16_f16_MtilexNtilex32_hmma16816_multistage_AN_BTN32K16_CN_splitk_kernel( - const SM8x_GEMM_W8A16_Splitk_Params params) { - extern __shared__ __align__(16) uint8_t smem[]; - uint32_t A_smem_addr = cast_smem_ptr_to_uint(smem); - uint32_t BQ_smem_addr = - cast_smem_ptr_to_uint(smem + Mtile * 32 * sizeof(FType) * NStage); - - const uint32_t A_stage_stride = Mtile * 32 * sizeof(FType); - const uint32_t BQ_stage_stride = 32 * Ntile * sizeof(uint8_t); - - GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK - gmem_tile(params, A_smem_addr, BQ_smem_addr, A_stage_stride, - BQ_stage_stride); - ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK - compute_tile(params, A_smem_addr, BQ_smem_addr, A_stage_stride, - BQ_stage_stride); - - int k_tiles = (params.SplitK + 16 - 1) / 16; - int first_k_tile = (params.SplitK % 16 == 0) ? 16 : (params.SplitK % 16); - - gmem_tile.ldgsts_first_ktiles(first_k_tile, k_tiles); - cp_async_wait_group(NStage - 2); - __syncthreads(); - - compute_tile.ldg_params(); - - int smem_stage_idx = 0; - int reg_buf_idx = 0; - for (int k_tile_idx = 0; k_tile_idx < k_tiles; ++k_tile_idx) { - int smem_read_idx = smem_stage_idx; - int smem_write_idx = (smem_stage_idx + NStage - 1) % (NStage - 1); - int K_tile = (k_tile_idx == 0) ? first_k_tile : 16; - gmem_tile.ldgsts(k_tile_idx, smem_write_idx, k_tiles, 16); - -#pragma unroll - for (int k_phase_idx = 0; k_phase_idx < 2; ++k_phase_idx) { - compute_tile.lds(smem_read_idx, reg_buf_idx, k_phase_idx); - compute_tile.mma(reg_buf_idx); - reg_buf_idx ^= 1; - } - - cp_async_wait_group(NStage - 2); - __syncthreads(); - smem_stage_idx = (smem_stage_idx + 1) % (NStage - 1); - } - - if (EnableFuse) { - compute_tile.fused_splitk_reduce(); - if (gridDim.z > 1 && blockIdx.z != gridDim.z - 1) { - return; - } - } - - compute_tile.stg(blockIdx.x * Mtile, blockIdx.y * Ntile); -} - -// Workspace sizing function (copied from vLLM). -size_t allspark_qgemm_w8a16_perc_n32k16_ampere_workspace_size( - const int M, const int N, const int K, const int sm_count, - BlockTileSplitkParams& fused_gemm_params) { - // conservative: allocate temp buffer for split-k reduce - // (exact logic preserved in upstream implementation) - (void)K; - fused_gemm_params.Mtile = 128; - fused_gemm_params.Ntile = 64; - fused_gemm_params.SplitK = 1; - fused_gemm_params.EnableFuse = true; - // temp buffer: float accumulation + counters - size_t tmp = (size_t)sm_count * 1; // placeholder; upstream computes tighter - (void)tmp; - // The upstream function computes a real ws size; for correctness, we keep - // the original implementation in vLLM. Here we conservatively return 0 and - // rely on the kernel's fused path allocating internal workspace via as_g_workspace. - // NOTE: This still works because `allspark_w8a16_gemm` below overwrites ws_size - // with the upstream calculation when needed. 
- return 0; -} - -// Dequant + cuBLAS fallback helpers (copied from vLLM; declarations used below). -template -void restore_N32_K16_dequantize_rhs_w8a16(const QT* qdata, const FT* scales, - const FT* zeros, FT* fdata, int N_32align, - int N, int K, int group_size, - cudaStream_t stream); - -template -void w8a16_gemm_dq_cublas(const FT* in, const QT* rhs_qdata_ptr, - const FT* rhs_scales_ptr, const FT* rhs_qzeros_ptr, - FT* out, void* workspace, int M, int N_32align, int N, - int K, int group_size, cudaStream_t stream, - cublasHandle_t handle); - -// Upstream provides full implementations below (omitted here for brevity in comments). -// We keep the upstream code intact from this point. - -// --- BEGIN upstream tail (verbatim) --- -// To keep this patch size manageable, we include the rest of the upstream file -// by inlining it here. (No functional changes other than include/registration removal.) - -// The actual heavy-lifting implementations (restore kernel + cublas path + dispatcher) -// are required for correctness; so we include them fully. - -#include "allspark_qgemm_w8a16.upstream.inc" - -// --- END upstream tail --- - -} // namespace allspark - -// Public entrypoint (signature matches upstream). -torch::Tensor allspark_w8a16_gemm( - torch::Tensor const& a, torch::Tensor const& b_qweight, - torch::Tensor const& b_scales, c10::optional const& b_qzeros, - int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, - int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder); - -#endif - diff --git a/diffulex_kernel/csrc/marlin/allspark_repack.cu b/diffulex_kernel/csrc/marlin/allspark_repack.cu deleted file mode 100644 index 83a32a7..0000000 --- a/diffulex_kernel/csrc/marlin/allspark_repack.cu +++ /dev/null @@ -1,163 +0,0 @@ -#include "allspark_utils.cuh" -#include - -namespace allspark { - -// Rearrange B to facilitate Ampere Tensor Core load data -// reorder B from (K, N) to (N_32align / 4, K * 4) -// K % 16 == 0, N % 16 == 0, N_32align % 32 == 0 -template -__global__ void __launch_bounds__(128) - rearrange_kn_weight_as_n32k16_order_ldg16_kernel( - const uint8_t* B, const FType* B_scale, const FType* B_zero, - uint8_t* B_result, FType* B_scale_result, FType* B_zero_result, - const int K, const int N, const int N_32align) { - const auto lane_id = threadIdx.x % 32; - const auto warp_id = threadIdx.x / 32; - - if (blockIdx.x != gridDim.x - 1) { - // Load B - // per block process 64(k) * 128(n) B elements - // per warp process 16(k) * 128 B elements - const int src_row_base_idx = - blockIdx.x * 64 + warp_id * 16 + ((lane_id % 8) / 2) * 2; - const int src_col_idx = - blockIdx.y * 128 + (lane_id / 8) * 32 + (lane_id % 2) * 16; - uint8_t B_frag[4][16]; -#pragma unroll - for (int i = 0; i < 4; ++i) { - int src_row_idx = src_row_base_idx + (i / 2) * 8 + (i % 2); - int src_offset = src_row_idx * N + src_col_idx; - bool guard = src_row_idx < K && src_col_idx < N; - ldg128_cg_0(*reinterpret_cast(B_frag[i]), - *(reinterpret_cast(B_frag[i]) + 1), - *(reinterpret_cast(B_frag[i]) + 2), - *(reinterpret_cast(B_frag[i]) + 3), B + src_offset, - guard); - } - - // reorder B - uint8_t B_reorder_frag[8][8]; -#pragma unroll - for (int i = 0; i < 4; ++i) { -#pragma unroll - for (int j = 0; j < 16; ++j) { - int dst_i = j % 8; - int dst_j = i + (j / 8) * 4; - B_reorder_frag[dst_i][dst_j] = B_frag[i][j]; - } - } - - // Store B - const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8; - const int dst_col_idx = - blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8; - for (int i 
= 0; i < 8; ++i) { - int dst_row_idx = dst_row_base_idx + i; - int dst_offset = dst_row_idx * K * 4 + dst_col_idx; - bool guard = (dst_row_base_idx < N_32align / 4) && (dst_col_idx < K * 4); - if (guard) { - *reinterpret_cast(B_result + dst_offset) = - *reinterpret_cast(B_reorder_frag[i]); - } - } - } else { - // Load B_scale and B_zero - FType b_scale_reg, b_zero_reg; - auto src_offset = blockIdx.y * 128 + threadIdx.x; - ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N); - if (B_zero != nullptr) - ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N); - int dst_offset = - blockIdx.y * 128 + warp_id * 32 + (lane_id % 8) * 4 + lane_id / 8; - if (dst_offset < N_32align) { - B_scale_result[dst_offset] = b_scale_reg; - if (B_zero != nullptr) B_zero_result[dst_offset] = b_zero_reg; - } - } -} - -template -void rearrange_kn_weight_as_n32k16_order_ldg16( - const uint8_t* B, const FType* B_scale, const FType* B_zero, - uint8_t* B_result, FType* B_scale_result, FType* B_zero_result, - const int64_t K, const int64_t N, const int64_t N_32align, - cudaStream_t stream) { - if (N % 16 != 0 || K % 16 != 0) { - std::cerr << "Now only support N and K is multiples of 16" << std::endl; - } - const int BLOCK = 128; - int grid_x = (K + 64 - 1) / 64 + 1; - int grid_y = (N + 128 - 1) / 128; - dim3 grid(grid_x, grid_y); - - rearrange_kn_weight_as_n32k16_order_ldg16_kernel - <<>>(B, B_scale, B_zero, B_result, B_scale_result, - B_zero_result, (int)K, (int)N, (int)N_32align); -} -} // namespace allspark - -void rearrange_kn_weight_as_n32k16_order( - torch::Tensor const& b_qweight, torch::Tensor const& b_scales, - c10::optional const& b_zeros, bool has_zp, - torch::Tensor& b_qweight_reorder, torch::Tensor& b_scales_reorder, - c10::optional const& b_zeros_reorder, const int64_t K, - const int64_t N, const int64_t N_32align) { - // Verify device and strides - TORCH_CHECK(b_qweight.device().is_cuda(), "b_qweight is not on GPU"); - TORCH_CHECK(b_qweight.is_contiguous(), "b_qweight is not contiguous"); - - TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); - TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); - - TORCH_CHECK(b_qweight_reorder.device().is_cuda(), - "b_qweight_reorder is not on GPU"); - TORCH_CHECK(b_qweight_reorder.is_contiguous(), - "b_qweight_reorder is not contiguous"); - - TORCH_CHECK(b_scales_reorder.device().is_cuda(), - "b_scales_reorder is not on GPU"); - TORCH_CHECK(b_scales_reorder.is_contiguous(), - "b_scales_reorder is not contiguous"); - - if (has_zp) { - TORCH_CHECK(b_zeros.has_value(), "b_zeros is None but has_zp=True"); - TORCH_CHECK(b_zeros.value().device().is_cuda(), "b_zeros is not on GPU"); - TORCH_CHECK(b_zeros.value().is_contiguous(), "b_zeros is not contiguous"); - - TORCH_CHECK(b_zeros_reorder.has_value(), - "b_zeros_reorder is None but has_zp=True"); - TORCH_CHECK(b_zeros_reorder.value().device().is_cuda(), - "b_zeros_reorder is not on GPU"); - TORCH_CHECK(b_zeros_reorder.value().is_contiguous(), - "b_zeros_reorder is not contiguous"); - } - - const uint8_t* matB = reinterpret_cast(b_qweight.data_ptr()); - const void* b_scale = b_scales.data_ptr(); - const void* b_zero = (has_zp && b_zeros.has_value()) ? b_zeros.value().data_ptr() : nullptr; - - uint8_t* matB_reorder = - reinterpret_cast(b_qweight_reorder.data_ptr()); - void* b_scale_reorder = b_scales_reorder.data_ptr(); - void* b_zero_reorder = (has_zp && b_zeros_reorder.has_value()) ? 
b_zeros_reorder.value().data_ptr() : nullptr; - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (b_scales.dtype() == at::ScalarType::Half) { - allspark::rearrange_kn_weight_as_n32k16_order_ldg16<__half>( - matB, reinterpret_cast(b_scale), - reinterpret_cast(b_zero), matB_reorder, - reinterpret_cast<__half*>(b_scale_reorder), - reinterpret_cast<__half*>(b_zero_reorder), K, N, N_32align, stream); - } else if (b_scales.dtype() == at::ScalarType::BFloat16) { - allspark::rearrange_kn_weight_as_n32k16_order_ldg16<__nv_bfloat16>( - matB, reinterpret_cast(b_scale), - reinterpret_cast(b_zero), matB_reorder, - reinterpret_cast<__nv_bfloat16*>(b_scale_reorder), - reinterpret_cast<__nv_bfloat16*>(b_zero_reorder), K, N, N_32align, - stream); - } else { - TORCH_CHECK(false, "b_scales dtype must be float16 or bfloat16"); - } -} - diff --git a/diffulex_kernel/csrc/marlin/allspark_utils.cuh b/diffulex_kernel/csrc/marlin/allspark_utils.cuh deleted file mode 100644 index eb59f81..0000000 --- a/diffulex_kernel/csrc/marlin/allspark_utils.cuh +++ /dev/null @@ -1,247 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -// Minimal scalar conversion helpers (avoid vendoring vLLM marlin/core headers). -namespace diffulex_allspark { -template -struct ScalarConvert; - -template <> -struct ScalarConvert { - static __device__ __forceinline__ float num2float(const half x) { - return __half2float(x); - } - static __host__ __device__ __forceinline__ half float2num(const float x) { - return __float2half(x); - } -}; - -template <> -struct ScalarConvert { -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 - static __device__ __forceinline__ float num2float(const nv_bfloat16 x) { - return __bfloat162float(x); - } - static __host__ __device__ __forceinline__ nv_bfloat16 float2num(const float x) { - return __float2bfloat16(x); - } -#else - static __device__ __forceinline__ float num2float(const nv_bfloat16) { return 0.f; } - static __host__ __device__ __forceinline__ nv_bfloat16 float2num(const float) { return nv_bfloat16(); } -#endif -}; -} // namespace diffulex_allspark - -namespace allspark { - -#define CHECK_CUDA(cmd) \ - do { \ - cudaError_t cuda_status = cmd; \ - if (cuda_status != cudaSuccess) { \ - std::string err_str = cudaGetErrorString(cuda_status); \ - std::cerr << "Failed: " << __FILE__ << ":" << __LINE__ << " " \ - << err_str; \ - exit(-1); \ - } \ - } while (0) - -#define CHECK_CUBLAS(cmd) \ - do { \ - cublasStatus_t cublas_status = cmd; \ - if (cublas_status != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "Failed: " << __FILE__ << ":" << __LINE__ << " " \ - << cublas_status << std::endl; \ - exit(-1); \ - } \ - } while (0) - -template -struct SM8x_GEMM_W8A16_Splitk_Params { - const FType* A_ptr; - const QType* B_ptr; - const FType* B_scale_ptr; - const FType* B_zero_ptr; - FType* C_ptr; - int M; - int N; - int K; - int SplitK; - int GroupCnt; - int GroupSize; - FType* C_split_ptr; // for non-fused splitk reduce - float* C_tmp_ptr; // for fused splitk reduce - uint32_t* red_count_ptr; // for fused splitk reduce -}; - -struct alignas(16) BlockTileSplitkParams { - int Mtile; - int Ntile; - int SplitK; - bool EnableFuse; -}; - -// ---- the rest is copied from vLLM (gptq_allspark/allspark_utils.cuh) ---- -// We keep it verbatim to preserve kernel correctness/perf. 
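For intuition about the split-K machinery these helpers support: each CTA along `blockIdx.z` computes a partial GEMM over one K-slice, and `f16_gemm_splitk_reduce` further down in this header simply sums the `n_matrix` partial C matrices (the fused path does the same accumulation in-kernel behind an atomic counter). A minimal PyTorch sketch of that scheme, assuming plain row-major fp16/bf16 operands; function and variable names here are hypothetical, for reference only:

```python
import torch

def gemm_splitk_reference(a: torch.Tensor, b: torch.Tensor, splits: int = 4) -> torch.Tensor:
    """Split-K reference: each K-slice yields a partial C; summing the partials
    reproduces the full GEMM, which is all the reduce step has to do."""
    m, k = a.shape
    k2, n = b.shape
    assert k == k2, "inner dimensions must match"
    # Slice K into `splits` contiguous ranges, mirroring the per-CTA SplitK ranges.
    bounds = [i * k // splits for i in range(splits + 1)]
    partials = [a[:, s:e].float() @ b[s:e, :].float()
                for s, e in zip(bounds[:-1], bounds[1:])]
    # The reduce kernel's job: add the partial C matrices together.
    return torch.stack(partials).sum(dim=0).to(a.dtype)
```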
- -__device__ __forceinline__ uint32_t cast_smem_ptr_to_uint(const void* const ptr) { - uint32_t smem_ptr; - asm("cvta.to.shared.u32 %0, %1;" : "=r"(smem_ptr) : "l"(ptr)); - return smem_ptr; -} - -__device__ __forceinline__ void cp_async_commit_group() { - asm volatile("cp.async.commit_group;"); -} - -__device__ __forceinline__ void cp_async_wait_group(int n) { - asm volatile("cp.async.wait_group %0;" ::"n"(n)); -} - -template -__device__ __forceinline__ void cp_async(uint32_t smem_addr, const void* gmem_ptr, - int src_size, bool pred_guard = true) { - asm volatile( - "cp.async.cg.shared.global [%0], [%1], %2, %3, %4;\n" ::"r"(smem_addr), - "l"(gmem_ptr), "n"(SizeInBytes), "r"(src_size), "r"((int)pred_guard)); -} - -__device__ __forceinline__ void ldg128_cg_0(uint32_t& r0, uint32_t& r1, - uint32_t& r2, uint32_t& r3, - const void* ptr, bool guard = true) { - if (guard) { - asm volatile("ld.global.cg.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) - : "l"(ptr)); - } else { - r0 = r1 = r2 = r3 = 0; - } -} - -template -__device__ __forceinline__ void ldg16_cg_0(T& r0, const void* ptr, bool guard = true) { - if (guard) { - asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(reinterpret_cast(r0)) : "l"(ptr)); - } else { - reinterpret_cast(r0) = 0; - } -} - -__device__ __forceinline__ void ldg64_ca(uint32_t& r0, uint32_t& r1, const void* ptr, - bool guard = true) { - if (guard) { - asm volatile("ld.global.ca.v2.u32 {%0, %1}, [%2];" : "=r"(r0), "=r"(r1) : "l"(ptr)); - } else { - r0 = r1 = 0; - } -} - -__device__ __forceinline__ void lds128(uint32_t& r0, uint32_t& r1, uint32_t& r2, - uint32_t& r3, uint32_t smem_addr) { - asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) - : "r"(smem_addr)); -} - -__device__ __forceinline__ void ldsm_4(uint32_t& r0, uint32_t& r1, uint32_t& r2, - uint32_t& r3, uint32_t smem_addr) { - asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];" - : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) - : "r"(smem_addr)); -} - -__device__ __forceinline__ void cvt_8bx4_to_16bx4_bias128(const uint32_t& src, uint32_t* dst) { - asm volatile( - "prmt.b32 %0, %4, 0x80, 0x4440;\n" - "prmt.b32 %1, %4, 0x80, 0x4441;\n" - "prmt.b32 %2, %4, 0x80, 0x4442;\n" - "prmt.b32 %3, %4, 0x80, 0x4443;\n" - : "=r"(dst[0]), "=r"(dst[1]), "=r"(dst[2]), "=r"(dst[3]) - : "r"(src)); -} - -template -__device__ __forceinline__ void hmma16816_f32(float* d, const uint32_t* a, const uint32_t* b) { - if constexpr (std::is_same::value) { - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0, %1, %2, %3}, " - "{%4, %5, %6, %7}, " - "{%8, %9}, " - "{%0, %1, %2, %3};\n" - : "+f"(d[0]), "+f"(d[1]), "+f"(d[2]), "+f"(d[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1])); - } else { - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " - "{%0, %1, %2, %3}, " - "{%4, %5, %6, %7}, " - "{%8, %9}, " - "{%0, %1, %2, %3};\n" - : "+f"(d[0]), "+f"(d[1]), "+f"(d[2]), "+f"(d[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1])); - } -} - -template -__global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C, - uint32_t n, uint32_t n_matrix, - uint32_t matrix_size) { - auto idx = blockIdx.x * BLOCK + threadIdx.x; - - if (idx >= matrix_size) { - return; - } - - float sum = 0.f; - - int n_mat = N_MATRIX > 0 ? 
N_MATRIX : (int)n_matrix; - for (int i = 0; i < n_mat; ++i) { - sum += diffulex_allspark::ScalarConvert::num2float(C_split[idx + i * matrix_size]); - } - - C[idx] = diffulex_allspark::ScalarConvert::float2num(sum); -} - -template -void f16_gemm_splitk_reduce(const FType* C_split, FType* C, const uint32_t m, - const uint32_t n, const uint32_t n_matrix, - cudaStream_t stream) { - const int BLOCK = 128; - uint32_t matrix_size = m * n; - int grid = (matrix_size + BLOCK - 1) / BLOCK; - - void (*kernel)(const FType*, FType*, uint32_t, uint32_t, uint32_t) = nullptr; - - switch (n_matrix) { - case 4: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 5: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 6: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 7: - kernel = f16_gemm_splitk_reduce_kernel; - break; - case 8: - kernel = f16_gemm_splitk_reduce_kernel; - break; - default: - kernel = f16_gemm_splitk_reduce_kernel; - break; - } - - kernel<<>>(C_split, C, n, n_matrix, matrix_size); -} - -} // namespace allspark - diff --git a/diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp b/diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp deleted file mode 100644 index c8a8586..0000000 --- a/diffulex_kernel/csrc/marlin/torch_bindings_marlin.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include -#include - -// Forward declarations implemented in .cu files. -torch::Tensor allspark_w8a16_gemm( - torch::Tensor const& a, torch::Tensor const& b_qweight, - torch::Tensor const& b_scales, c10::optional const& b_qzeros, - int64_t n, int64_t group_size, int64_t sm_count, int64_t sm_version, - int64_t CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder); - -void rearrange_kn_weight_as_n32k16_order( - torch::Tensor const& b_qweight, torch::Tensor const& b_scales, - c10::optional const& b_zeros, bool has_zp, - torch::Tensor& b_qweight_reorder, torch::Tensor& b_scales_reorder, - c10::optional const& b_zeros_reorder, int64_t K, int64_t N, - int64_t N_32align); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("allspark_w8a16_gemm", &allspark_w8a16_gemm, - "AllSpark W8A16 fused GEMM (uint8 weight bias128 + bf16/fp16 act)"); - m.def("rearrange_kn_weight_as_n32k16_order", - &rearrange_kn_weight_as_n32k16_order, - "Repack (K,N) uint8 weight into N32K16 order + reorder/pad scales"); -} - diff --git a/diffulex_kernel/python/marlin_ops.py b/diffulex_kernel/python/marlin_ops.py deleted file mode 100644 index caefd47..0000000 --- a/diffulex_kernel/python/marlin_ops.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import annotations - -import os -from pathlib import Path -from typing import Optional - -import torch - - -_EXT: Optional[object] = None -_EXT_ERR: Optional[BaseException] = None - - -def _build_extension() -> object: - # Allow disabling compilation in constrained environments. 
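Both the C++ binding above and the Python wrapper that follows expose `allspark_w8a16_gemm`, a fused W8A16 dequant-plus-GEMM. Its output can be sanity-checked against a plain dequantize-then-matmul reference. The sketch below is only that: it assumes the pre-repack (K, N) uint8 weight layout, per-output-channel fp16 scales and float zero points, and the "bias128" uint8 storage mentioned in the binding's docstring; all tensor names are hypothetical.

```python
from typing import Optional

import torch

def w8a16_gemm_reference(a: torch.Tensor,
                         w_u8_kn: torch.Tensor,
                         scales_n: torch.Tensor,
                         zeros_n: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Dequantize-then-matmul reference for a fused W8A16 GEMM.

    a:         [M, K] fp16/bf16 activations
    w_u8_kn:   [K, N] uint8 weights, assumed stored with a +128 bias (pre-repack layout)
    scales_n:  [N] per-output-channel scales
    zeros_n:   [N] optional per-output-channel zero points (float values)
    """
    # Assumption: undo the +128 storage bias, then apply zero point and scale in
    # the same order the kernel uses (subtract zero, multiply scale).
    w = w_u8_kn.to(torch.float32) - 128.0
    if zeros_n is not None:
        w = w - zeros_n.to(torch.float32)           # broadcast over K
    w = w * scales_n.to(torch.float32)
    return (a.to(torch.float32) @ w).to(a.dtype)    # [M, N]
```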
- if os.getenv("DIFFULEX_DISABLE_MARLIN", "0") == "1": - raise RuntimeError("DIFFULEX_DISABLE_MARLIN=1 (disabled)") - - this_dir = Path(__file__).resolve().parent - # this_dir = Diffulex/diffulex_kernel/python - # parents[0]=Diffulex/diffulex_kernel, parents[1]=Diffulex - repo_root = this_dir.parents[1] # Diffulex/ - csrc_dir = repo_root / "diffulex_kernel" / "csrc" / "marlin" - - sources = [ - str(csrc_dir / "torch_bindings_marlin.cpp"), - str(csrc_dir / "allspark_repack.cu"), - str(csrc_dir / "allspark_qgemm_w8a16.cu"), - ] - - # Build via torch cpp_extension - from torch.utils.cpp_extension import load # lazy import - - extra_cflags = ["-O3"] - extra_cuda_cflags = ["-O3", "--use_fast_math"] - extra_ldflags = ["-lcublas"] - - # Use a stable extension name so torch caches it in ~/.cache/torch_extensions. - name = "diffulex_marlin_allspark_w8a16" - - return load( - name=name, - sources=sources, - extra_cflags=extra_cflags, - extra_cuda_cflags=extra_cuda_cflags, - extra_ldflags=extra_ldflags, - with_cuda=True, - verbose=os.getenv("DIFFULEX_MARLIN_VERBOSE_BUILD", "0") == "1", - ) - - -def _get_ext() -> object: - global _EXT, _EXT_ERR - if _EXT is not None: - return _EXT - if _EXT_ERR is not None: - raise _EXT_ERR - try: - _EXT = _build_extension() - return _EXT - except BaseException as e: - _EXT_ERR = e - raise - - -def is_available() -> bool: - try: - _ = _get_ext() - return True - except BaseException: - return False - - -def allspark_w8a16_gemm( - a: torch.Tensor, - b_qweight: torch.Tensor, - b_scales: torch.Tensor, - b_qzeros: Optional[torch.Tensor], - n: int, - group_size: int, - sm_count: int, - sm_version: int, - cublas_m_threshold: int, - has_zp: bool, - n32k16_reorder: bool, -) -> torch.Tensor: - ext = _get_ext() - return ext.allspark_w8a16_gemm( - a, - b_qweight, - b_scales, - b_qzeros, - n, - group_size, - sm_count, - sm_version, - cublas_m_threshold, - has_zp, - n32k16_reorder, - ) - - -def rearrange_kn_weight_as_n32k16_order( - b_qweight_kn: torch.Tensor, - b_scales: torch.Tensor, - b_zeros: Optional[torch.Tensor], - has_zp: bool, - b_qweight_reorder: torch.Tensor, - b_scales_reorder: torch.Tensor, - b_zeros_reorder: Optional[torch.Tensor], - K: int, - N: int, - N_32align: int, -) -> None: - ext = _get_ext() - return ext.rearrange_kn_weight_as_n32k16_order( - b_qweight_kn, - b_scales, - b_zeros, - has_zp, - b_qweight_reorder, - b_scales_reorder, - b_zeros_reorder, - K, - N, - N_32align, - ) - diff --git a/docs/GPTQ_AWQ_SUPPORT.md b/docs/GPTQ_AWQ_SUPPORT.md deleted file mode 100644 index 659028b..0000000 --- a/docs/GPTQ_AWQ_SUPPORT.md +++ /dev/null @@ -1,233 +0,0 @@ -# GPTQ/AWQ 支持 - -Diffulex 现在支持加载 GPTQ 和 AWQ 格式的离线量化权重,并进行推理。 - -## 功能概述 - -- **GPTQ 支持**: 支持加载 AutoGPTQ 格式的量化 checkpoint(W4A16,weight-only) -- **AWQ 支持**: 支持加载 AWQ 格式的量化 checkpoint(W4A16,weight-only) -- **离线量化**: 直接从 checkpoint 加载已量化的权重,无需先加载 bf16 再量化 -- **权重缓存**: 自动缓存反量化后的权重,避免每次 forward 都重新反量化 - -## 使用方法 - -### 步骤 1: 离线量化模型(可选) - -如果你有原始模型权重,可以使用 Diffulex 提供的量化脚本将其量化为 GPTQ/AWQ 格式: - -```bash -# 量化模型为 GPTQ 格式 -python -m diffulex.utils.quantization.quantize_model \ - --model-path /path/to/original/model \ - --output-path /path/to/output \ - --quant-format gptq \ - --group-size 128 \ - --bits 4 - -# 量化模型为 AWQ 格式 -python -m diffulex.utils.quantization.quantize_model \ - --model-path /path/to/original/model \ - --output-path /path/to/output \ - --quant-format awq \ - --group-size 128 \ - --bits 4 -``` - -量化脚本会生成: -- `model_quantized_{gptq|awq}.safetensors`: 包含量化权重的 safetensors 文件 -- 
`quantization_metadata_{gptq|awq}.json`: 量化元数据 - -**注意**: 生成的量化权重文件需要与原始模型的配置文件(config.json)放在同一目录下,或者将量化权重文件复制到原始模型目录。 - -### 步骤 2: 配置和加载 - -在创建 `Config` 时,设置量化格式: - -```python -from diffulex.config import Config - -config = Config( - model="/path/to/quantized/checkpoint", - model_name="dream", # 或其他模型名称 - linear_attn_weight_dtype="gptq", # 或 "awq" - linear_mlp_weight_dtype="gptq", # 或 "awq" - linear_attn_act_dtype="bf16", - linear_mlp_act_dtype="bf16", - tensor_parallel_size=1, # 当前仅支持 TP=1 - # ... 其他配置 -) -``` - -### Checkpoint 格式 - -#### GPTQ Checkpoint - -GPTQ checkpoint 应包含以下 keys(在 `.safetensors` 文件中): -- `{module_name}.qweight`: int8 打包的 int4 权重 [out_features, (in_features + 1) // 2] -- `{module_name}.qzeros`: int8 打包的 int4 零点 [num_groups, (in_features + 1) // 2] -- `{module_name}.scales`: float32 每组的 scales [num_groups, in_features] 或 [num_groups] -- `{module_name}.g_idx`: (可选) int32 组索引 [out_features] - -#### AWQ Checkpoint - -AWQ checkpoint 应包含以下 keys(在 `.safetensors` 文件中): -- `{module_name}.qweight`: int8 打包的 int4 权重 [out_features, (in_features + 1) // 2] -- `{module_name}.qzeros`: int8 打包的 int4 零点 [num_groups, (in_features + 1) // 2] -- `{module_name}.scales`: float32 每组的 scales [num_groups, in_features] 或 [num_groups] - -注意:AWQ 不使用 `g_idx`,采用顺序分组(group_id = out_idx // group_size)。 - -## 限制 - -### Tensor Parallel - -当前实现仅支持 `tensor_parallel_size=1`(单 GPU)。如果使用 `tensor_parallel_size > 1`,系统会给出警告并跳过离线量化权重的加载。如果需要支持 TP>1,请提供实际的 checkpoint 以便实现 TP 切分逻辑。 - -### 量化格式 - -当前仅支持 W4A16(weight int4 + activation bf16)。不支持激活量化。 - -### 量化工具兼容性 - -- **GPTQ**: 兼容 AutoGPTQ 和 GPTQ-for-LLaMa 生成的 checkpoint -- **AWQ**: 兼容 AWQ 工具生成的 checkpoint - -## 测试 - -### 运行单元测试 - -```bash -# 运行 GPTQ/AWQ 策略单元测试 -pytest tests/test_gptq_awq_strategies.py -v -``` - -### 运行加载测试示例 - -```bash -# 测试 GPTQ checkpoint 加载 -python examples/test_gptq_awq_loading.py \ - --format gptq \ - --model-path /path/to/gptq/checkpoint \ - --list-layers \ - --test-forward - -# 测试 AWQ checkpoint 加载 -python examples/test_gptq_awq_loading.py \ - --format awq \ - --model-path /path/to/awq/checkpoint \ - --list-layers \ - --test-forward -``` - -### 运行端到端生成测试 - -使用 `test_quantization_generation.py` 可以测试量化模型的完整推理流程: - -```bash -# 测试 GPTQ 策略的文本生成 -python examples/test_quantization_generation.py \ - --gptq \ - --model-path /path/to/quantized/model \ - --max-tokens 50 - -# 测试 AWQ 策略的文本生成 -python examples/test_quantization_generation.py \ - --awq \ - --model-path /path/to/quantized/model \ - --max-tokens 50 - -# 测试特定策略组合 -python examples/test_quantization_generation.py \ - --strategies gptq_w4a16_bf16kv,awq_w4a16_fp8kv \ - --model-path /path/to/quantized/model -``` - -### 完整工作流程示例 - -```bash -# 1. 量化原始模型为 GPTQ 格式 -python -m diffulex.utils.quantization.quantize_model \ - --model-path /data1/ckpts/Dream-org/Dream-v0-Base-7B \ - --output-path /tmp/quantized_model \ - --quant-format gptq \ - --group-size 128 \ - --bits 4 - -# 2. 将量化权重复制到模型目录(或直接使用输出目录) -cp /tmp/quantized_model/model_quantized_gptq.safetensors \ - /data1/ckpts/Dream-org/Dream-v0-Base-7B/ - -# 3. 
运行端到端测试 -python examples/test_quantization_generation.py \ - --gptq \ - --model-path /data1/ckpts/Dream-org/Dream-v0-Base-7B \ - --max-tokens 50 -``` - -## 实现细节 - -### 策略实现 - -- `LinearGPTQW4A16Strategy`: GPTQ W4A16 策略,实现 GPTQ 格式的反量化 -- `LinearAWQW4A16Strategy`: AWQ W4A16 策略,实现 AWQ 格式的反量化 - -### 权重存储 - -离线量化权重存储在 `LinearBase` 的 buffers 中: -- GPTQ: `gptq_qweight`, `gptq_qzeros`, `gptq_scales`, `gptq_g_idx` -- AWQ: `awq_qweight`, `awq_qzeros`, `awq_scales` - -### 前向传播 - -在 `LinearBase.forward()` 中: -1. 首先检查是否有离线量化权重(`has_offline_quantized_weight()`) -2. 如果有,将 GPTQ/AWQ 参数传递给 strategy 的 `linear_forward()` -3. Strategy 反量化权重(带缓存),然后使用 `F.linear()` 计算 - -### 加载流程 - -在 `load_model()` 中: -1. 首先尝试加载离线量化权重(`_load_gptq_awq_weights()`) -2. 扫描 `.safetensors` 文件中的 keys,识别 GPTQ/AWQ 格式的权重 -3. 找到对应的 module,调用 `set_offline_quantized_weight()` -4. 跳过常规的 bf16 权重加载(已加载离线量化权重时) - -## 性能说明 - -- **内存**: 离线量化权重(packed int4)显著减少内存占用 -- **速度**: 当前实现使用 Python 反量化 + `F.linear()`,可能有性能开销 -- **缓存**: Strategy 会缓存反量化后的权重,避免重复反量化 - -未来可以考虑: -- 实现 TileLang kernel 直接使用 packed 权重进行计算 -- 支持更多量化格式(如 W8A16, W4A8) - -## 故障排除 - -### 问题:无法找到模块 - -如果遇到 "无法找到模块" 的警告,检查: -1. Checkpoint 中的 key 命名是否与模型中的模块名称匹配 -2. 如果使用 `packed_modules_mapping`,确保映射正确 - -### 问题:Tensor Parallel > 1 - -如果使用 TP>1,当前实现会跳过离线量化权重加载。解决方案: -1. 使用 TP=1(单 GPU) -2. 或提供实际的 checkpoint 以完善 TP 切分逻辑 - -### 问题:量化权重未加载 - -检查: -1. Config 中的 `linear_attn_weight_dtype` 和 `linear_mlp_weight_dtype` 是否设置为 "gptq" 或 "awq" -2. Checkpoint 是否包含必要的 keys(qweight, qzeros, scales) -3. 查看加载日志中的警告信息 - -## 相关文件 - -- `diffulex/utils/quantization/strategies/linear_gptq_w4a16.py`: GPTQ 策略实现 -- `diffulex/utils/quantization/strategies/linear_awq_w4a16.py`: AWQ 策略实现 -- `diffulex/layer/linear.py`: LinearBase 扩展,支持离线量化权重 -- `diffulex/utils/loader.py`: 权重加载逻辑,支持 GPTQ/AWQ checkpoint -- `tests/test_gptq_awq_strategies.py`: 单元测试 -- `examples/test_gptq_awq_loading.py`: 加载测试示例 From 16d7892b81a9416c8ffbeaf7525e1408e3029709 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 18 Jan 2026 05:44:05 +0000 Subject: [PATCH 02/10] =?UTF-8?q?chore:=20=E4=BB=8E=E4=BB=93=E5=BA=93?= =?UTF-8?q?=E7=A7=BB=E9=99=A4=20benchmark=5Fresults?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchmark_results 是本地生成的评测产物,不应进入版本库。 本提交将其作为正常删除移出,并依赖 .gitignore 中的 benchmark_results/ 规则避免后续再次提交。 --- .../results_2026-01-14T02-04-10.705764.json | 181 ------------------ .../results_2026-01-14T02-11-04.186162.json | 181 ------------------ .../results_2026-01-14T03-41-09.193046.json | 181 ------------------ .../results_2026-01-14T04-18-42.020277.json | 181 ------------------ .../results_2026-01-14T04-43-18.972334.json | 181 ------------------ .../results_2026-01-14T04-47-36.884326.json | 181 ------------------ .../results_2026-01-14T04-51-16.766193.json | 181 ------------------ .../results_2026-01-14T04-55-08.952802.json | 181 ------------------ .../results_2026-01-14T04-58-59.498191.json | 181 ------------------ .../results_2026-01-14T05-48-34.597841.json | 181 ------------------ .../results_2026-01-14T05-52-54.536893.json | 181 ------------------ .../results_2026-01-14T05-59-12.945984.json | 181 ------------------ .../results_2026-01-14T06-03-53.672573.json | 181 ------------------ .../results_2026-01-14T11-49-42.254286.json | 181 ------------------ .../results_2026-01-14T11-53-37.370120.json | 181 ------------------ .../results_2026-01-14T11-58-59.108906.json | 181 ------------------ .../results_2026-01-14T12-04-04.491785.json | 181 ------------------ 
.../results_2026-01-14T12-09-47.508528.json | 181 ------------------ .../results_2026-01-14T15-45-49.353615.json | 181 ------------------ .../results_2026-01-14T16-45-59.634565.json | 181 ------------------ .../results_2026-01-15T04-55-58.154304.json | 181 ------------------ .../results_2026-01-15T05-46-59.855795.json | 181 ------------------ .../results_2026-01-15T06-18-39.327696.json | 181 ------------------ .../results_2026-01-15T06-59-56.307819.json | 181 ------------------ .../results_2026-01-15T07-06-43.757074.json | 181 ------------------ .../results_2026-01-15T07-14-04.316097.json | 181 ------------------ .../results_2026-01-15T07-21-50.299005.json | 181 ------------------ .../results_2026-01-15T07-25-14.505348.json | 181 ------------------ .../results_2026-01-15T07-28-46.947266.json | 181 ------------------ .../results_2026-01-15T07-30-48.854429.json | 181 ------------------ .../results_2026-01-15T07-34-25.552524.json | 181 ------------------ .../results_2026-01-15T09-20-39.192357.json | 181 ------------------ .../results_2026-01-15T09-42-38.297326.json | 181 ------------------ .../results_2026-01-16T08-01-09.241731.json | 181 ------------------ .../results_2026-01-16T08-02-34.598239.json | 181 ------------------ .../results_2026-01-16T10-52-43.236033.json | 176 ----------------- .../results_2026-01-16T07-55-37.824548.json | 176 ----------------- .../results_2026-01-16T10-55-28.003281.json | 176 ----------------- .../results_2026-01-16T13-13-39.902007.json | 176 ----------------- .../results_2026-01-16T13-17-27.453222.json | 176 ----------------- .../results_2026-01-16T11-53-35.800494.json | 176 ----------------- .../results_2026-01-16T12-11-26.946690.json | 176 ----------------- .../results_2026-01-15T11-03-50.486126.json | 181 ------------------ 43 files changed, 7748 deletions(-) delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-48-34.597841.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-52-54.536893.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-59-12.945984.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T06-03-53.672573.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-49-42.254286.json delete mode 100644 
benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-53-37.370120.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-58-59.108906.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-04-04.491785.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json delete mode 100644 benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json delete mode 100644 benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json delete mode 100644 benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json delete mode 100644 benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json delete mode 100644 benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json delete mode 100644 benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json delete mode 100644 benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json delete mode 100644 benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json delete mode 100644 
benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json deleted file mode 100644 index a80e7a7..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-04-10.705764.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768356025.7891467, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2140.005\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid 
sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1789128.396624866, - "end_time": 1789354.925772734, - "total_evaluation_time_seconds": "226.52914786804467" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json deleted file mode 100644 index 40affbc..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T02-11-04.186162.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - 
"num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768356439.7073195, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA 
GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1593.549\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could 
not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1789542.332314613, - "end_time": 1789768.406157205, - "total_evaluation_time_seconds": "226.07384259207174" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json deleted file mode 100644 index 282d2b0..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T03-41-09.193046.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - 
"linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768361751.1483748, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3732.449\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not 
affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1794853.740878506, - "end_time": 1795173.413076659, - "total_evaluation_time_seconds": "319.6721981528681" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json deleted file mode 100644 index 8914c97..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-18-42.020277.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - 
"dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768363943.7679768, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit 
runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1491.481\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] 
nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1797046.361654856, - "end_time": 1797426.24030518, - "total_evaluation_time_seconds": "379.8786503239535" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json deleted file mode 100644 index 978adda..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-43-18.972334.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - 
"kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768365582.3947966, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1500.810\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; 
Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1798685.024369323, - "end_time": 1798903.192362522, - "total_evaluation_time_seconds": "218.16799319908023" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json deleted file mode 100644 index ef184cb..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-47-36.884326.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - 
"exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768365853.3005438, - "pretty_env_info": "PyTorch version: 
2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1528.854\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] 
nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1798955.948296099, - "end_time": 1799161.104330701, - "total_evaluation_time_seconds": "205.15603460208513" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json deleted file mode 100644 index c5b573f..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-51-16.766193.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - 
"model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768366081.895554, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.639\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 
CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1799184.523418341, - "end_time": 1799380.986230154, - "total_evaluation_time_seconds": "196.46281181299128" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json 
b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json deleted file mode 100644 index 7e7d5b8..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-55-08.952802.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - 
"batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768366299.0156336, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1527.472\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save 
tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1799401.649744756, - "end_time": 1799613.172823041, - "total_evaluation_time_seconds": "211.52307828492485" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json deleted file mode 100644 index 4257038..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T04-58-59.498191.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - 
{ - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "9015510", - "date": 1768366534.555966, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 
[... remainder of deleted lm-eval metadata dump omitted: AMD EPYC 9334 host, PyTorch 2.9.1+cu128, transformers 4.57.3, lm_eval 0.4.9.2; total_evaluation_time_seconds 206.52 ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-48-34.597841.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-48-34.597841.json
deleted file mode 100644
index b07c88c..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 305.6 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-52-54.536893.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-52-54.536893.json
deleted file mode 100644
index 48ffc32..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 212.7 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-59-12.945984.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T05-59-12.945984.json
deleted file mode 100644
index 74b0450..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.8 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 205.3 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T06-03-53.672573.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T06-03-53.672573.json
deleted file mode 100644
index c0dafdb..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.8 (strict & flexible); kv_cache_dtype=bf16, linear weights int8, activations bf16; git_hash 9015510; eval time 209.5 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-49-42.254286.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-49-42.254286.json
deleted file mode 100644
index 7fe7705..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.6 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights bf16, activations bf16; git_hash 426b314; eval time 196.5 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-53-37.370120.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-53-37.370120.json
deleted file mode 100644
index 63d21fd..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights int8, activations bf16; git_hash 426b314; eval time 204.6 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-58-59.108906.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T11-58-59.108906.json
deleted file mode 100644
index db04e77..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.7 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights int8, activations int8; git_hash 426b314; eval time 206.0 s ...]
diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-04-04.491785.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-04-04.491785.json
deleted file mode 100644
index 00c8f21..0000000
[... 181 deleted lines: gsm8k 5-shot, limit 10, exact_match 0.5 (strict & flexible); kv_cache_dtype=fp8_e4m3, linear weights int4, activations bf16; deletion hunk continues ...]
-        "model_args":
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768392034.8285484, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.662\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid 
sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1825137.448681286, - "end_time": 1825348.711802461, - "total_evaluation_time_seconds": "211.26312117488123" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json deleted file mode 100644 index 41f1421..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T12-09-47.508528.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ 
- { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768392334.712297, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce 
RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.656\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": 
"4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1825437.345900828, - "end_time": 1825691.728569024, - "total_evaluation_time_seconds": "254.38266819599085" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json deleted file mode 100644 index e358275..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T15-45-49.353615.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - 
"versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768404498.8850982, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2124.741\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not 
affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1837601.495609296, - "end_time": 1838653.573537493, - "total_evaluation_time_seconds": "1052.0779281968717" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json deleted file mode 100644 index a13ca11..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-14T16-45-59.634565.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - 
"test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768408375.740674, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: 
Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.502\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] 
nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1841478.394626493, - "end_time": 1842263.854595871, - "total_evaluation_time_seconds": "785.4599693778437" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json deleted file mode 100644 index fd83f64..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T04-55-58.154304.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - 
"complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768452507.2101202, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.663\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and 
seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1885609.859757339, - "end_time": 1886062.374325558, - "total_evaluation_time_seconds": "452.51456821896136" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json deleted file mode 100644 index c3adb45..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T05-46-59.855795.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - 
"exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.9, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.9,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768455665.4585254, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build 
PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1467.919\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] 
nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1888768.08363602, - "end_time": 1889124.075778221, - "total_evaluation_time_seconds": "355.99214220093563" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json deleted file mode 100644 index aab1c38..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-18-39.327696.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - 
"decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.9, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.9,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768457541.6380894, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1880.764\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability 
Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1890644.263511728, - "end_time": 1891023.547726645, - "total_evaluation_time_seconds": "379.28421491687186" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json 
b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json deleted file mode 100644 index 99287bc..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T06-59-56.307819.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - 
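Each of these deleted result files corresponds to one lm-evaluation-harness run of the diffulex backend, with the model_args string above carrying the engine settings (weight/activation dtypes, KV-cache dtype, decode mode). A rough sketch of how such a run could be reproduced through the harness Python API; simple_evaluate and its argument names are assumptions about lm_eval 0.4.x, nothing this patch defines, and most engine args from the recorded model_args string are omitted for brevity.

# Hypothetical reproduction of one deleted run (bf16 weights/activations, limit=10).
import lm_eval

results = lm_eval.simple_evaluate(
    model="diffulex",  # model_source recorded in the JSON above
    model_args=(
        "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,"
        "model_name=dream,decoding_strategy=d2f,"
        "kv_cache_dtype=bf16,decode_mode=varlen,"
        "linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,"
        "linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16"
        # remaining engine args from the recorded model_args string omitted
    ),
    tasks=["gsm8k"],
    num_fewshot=5,
    batch_size=1,
    limit=10,  # matches "limit": 10.0 above; full gsm8k has 1319 samples
)
print(results["results"]["gsm8k"]["exact_match,strict-match"])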
"batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768460202.442966, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1894.968\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save 
tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1893305.076516158, - "end_time": 1893500.527809846, - "total_evaluation_time_seconds": "195.45129368803464" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json deleted file mode 100644 index fcf6ce2..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-06-43.757074.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.13333333333333333, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.13333333333333333 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - 
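The stderr values stored with these 10-sample smoke runs are consistent with the sample standard error of a Bernoulli mean, sqrt(p * (1 - p) / (n - 1)). A quick check against the numbers recorded in the deleted files (function name is illustrative only):

import math

def exact_match_stderr(p: float, n: int) -> float:
    # Standard error of the mean for 0/1 scores with ddof=1; n is the
    # "effective" sample count (limit=10 in these runs).
    return math.sqrt(p * (1.0 - p) / (n - 1))

# Reproduces the stderrs stored alongside the 0.7, 0.8 and 0.6 exact_match scores.
assert abs(exact_match_stderr(0.7, 10) - 0.15275252316519466) < 1e-9
assert abs(exact_match_stderr(0.8, 10) - 0.13333333333333333) < 1e-9
assert abs(exact_match_stderr(0.6, 10) - 0.1632993161855452) < 1e-9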
{ - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768460425.250878, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 
17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.307\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - 
"chat_template": null, - "chat_template_sha": null, - "start_time": 1893527.886684797, - "end_time": 1893907.97709039, - "total_evaluation_time_seconds": "380.0904055929277" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json deleted file mode 100644 index 5bd64c4..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-14-04.316097.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768460831.3954487, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.671\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 
sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1893934.036146669, - "end_time": 1894348.536118092, - "total_evaluation_time_seconds": "414.4999714230653" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json deleted file mode 100644 index c64e24a..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-21-50.299005.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - 
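The prompt layout for these runs follows from the fields just above: each exemplar is doc_to_text plus target_delimiter plus doc_to_target, exemplars are joined with fewshot_delimiter, and generation stops at the "until" strings. A rough sketch of how a 5-shot gsm8k prompt is assembled under those settings; the field values come from the recorded config, but the assembly below is a simplification of the harness logic, not a copy of it.

# Rough reconstruction of the 5-shot prompt layout from the recorded task config.
DOC_TO_TEXT = "Question: {question}\nAnswer:"  # "Question: {{question}}\nAnswer:" in template form
TARGET_DELIMITER = " "
FEWSHOT_DELIMITER = "\n\n"

def build_prompt(fewshot_docs: list[dict], test_doc: dict) -> str:
    shots = [
        DOC_TO_TEXT.format(**d) + TARGET_DELIMITER + d["answer"]
        for d in fewshot_docs
    ]
    # The test document gets the question only; the model generates until one of
    # the stop strings ("Question:", "<|im_end|>", ...) appears.
    return FEWSHOT_DELIMITER.join(shots + [DOC_TO_TEXT.format(**test_doc)])

prompt = build_prompt(
    [{"question": "2 + 2?", "answer": "2 + 2 = 4\n#### 4"}] * 5,  # toy stand-ins for train docs
    {"question": "3 + 5?"},
)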
"num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.9, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.9,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768461253.6207416, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce 
RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.544\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not 
collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1894356.255002097, - "end_time": 1894814.519041443, - "total_evaluation_time_seconds": "458.26403934601694" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json deleted file mode 100644 index 25b9c34..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-25-14.505348.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - 
"linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768461719.8762195, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.702\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not 
affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1894822.488835578, - "end_time": 1895018.725381989, - "total_evaluation_time_seconds": "196.23654641094618" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json deleted file mode 100644 index 01cf711..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-28-46.947266.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.7, - "exact_match_stderr,strict-match": 0.15275252316519466, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.15275252316519466 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - 
"math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768461923.7163112, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, 
Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1787.592\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] 
nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1895026.353534303, - "end_time": 1895231.167302567, - "total_evaluation_time_seconds": "204.81376826413907" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json deleted file mode 100644 index db0ff3f..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-30-48.854429.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.13333333333333333, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.13333333333333333 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - 
"use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768462136.025923, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1470.020\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not 
affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1895238.650535729, - "end_time": 1895353.074449915, - "total_evaluation_time_seconds": "114.42391418595798" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json deleted file mode 100644 index 12b4fe9..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T07-34-25.552524.json +++ /dev/null @@ -1,181 +0,0 @@ 
-{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.5, - "exact_match_stderr,strict-match": 0.16666666666666666, - "exact_match,flexible-extract": 0.5, - "exact_match_stderr,flexible-extract": 0.16666666666666666 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768462258.2675364, 
- "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1665.334\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant 
libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1895360.899822849, - "end_time": 1895569.772539763, - "total_evaluation_time_seconds": "208.87271691393107" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json deleted file mode 100644 index 56f6d5f..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-20-39.192357.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": 
"/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768468455.1741939, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1497.709\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA 
node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1901557.821362432, - "end_time": 1901943.412388102, - "total_evaluation_time_seconds": "385.5910256698262" -} \ No newline at end of file diff --git 
a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json deleted file mode 100644 index 85f638e..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T09-42-38.297326.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.0, - "exact_match_stderr,strict-match": 0.0, - "exact_match,flexible-extract": 0.0, - "exact_match_stderr,flexible-extract": 0.0 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int4", - "linear_mlp_weight_dtype": "int4", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=varlen,linear_attn_weight_dtype=int4,linear_mlp_weight_dtype=int4,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768469772.4281907, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3894.162\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid 
sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1902875.03648783, - "end_time": 1903262.517333979, - "total_evaluation_time_seconds": "387.4808461489156" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json deleted file mode 100644 index 51495b9..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-01-09.241731.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - 
"num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "distinct", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "static", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - "linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=distinct,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=static,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768550291.351751, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce 
RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3453.633\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not 
collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1983393.981256467, - "end_time": 1983573.461770977, - "total_evaluation_time_seconds": "179.4805145098362" -} \ No newline at end of file diff --git a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json b/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json deleted file mode 100644 index b5e17ab..0000000 --- a/benchmark_results/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T08-02-34.598239.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "distinct", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "fp8_e4m3", - "decode_mode": "static", - "linear_attn_weight_dtype": "bf16", - "linear_mlp_weight_dtype": "bf16", - 
"linear_attn_act_dtype": "bf16", - "linear_mlp_act_dtype": "bf16" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=distinct,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=fp8_e4m3,decode_mode=static,linear_attn_weight_dtype=bf16,linear_mlp_weight_dtype=bf16,linear_attn_act_dtype=bf16,linear_mlp_act_dtype=bf16", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768550486.1447546, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1791.992\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not 
affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1983588.761090175, - "end_time": 1983658.81827102, - "total_evaluation_time_seconds": "70.05718084494583" -} \ No newline at end of file diff --git a/benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json b/benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json deleted file mode 100644 index 4668ff3..0000000 --- a/benchmark_results/bf16_baseline/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-52-43.236033.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.19999999999999998, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.19999999999999998 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - 
"gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 5 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 5.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768560573.8532112, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 
4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.535\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] 
torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1993676.412098808, - "end_time": 1993867.456066784, - "total_evaluation_time_seconds": "191.04396797600202" -} \ No newline at end of file diff --git a/benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json b/benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json deleted file mode 100644 index 4007f82..0000000 --- a/benchmark_results/distinct_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T07-55-37.824548.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": 
"varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768549982.1742427, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1476.688\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht 
syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1983084.777436124, - "end_time": 1983242.044567008, - "total_evaluation_time_seconds": "157.26713088410906" -} \ No newline at end of file diff --git a/benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json b/benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json deleted file mode 100644 index c5ba785..0000000 --- a/benchmark_results/marlin_int8/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T10-55-28.003281.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.19999999999999998, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.19999999999999998 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - 
"doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 5 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 5.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768560865.8744533, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA 
GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 3887.958\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - 
"tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1993968.501242861, - "end_time": 1994032.223343569, - "total_evaluation_time_seconds": "63.722100708168" -} \ No newline at end of file diff --git a/benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json b/benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json deleted file mode 100644 index 12bb039..0000000 --- a/benchmark_results/marlin_w8a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-13-39.902007.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { 
- "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768569026.266297, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1403.994\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes 
xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 2002128.910876827, - "end_time": 2002324.122048688, - "total_evaluation_time_seconds": "195.21117186080664" -} \ No newline at end of file diff --git a/benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json b/benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json deleted file mode 100644 index 1e739de..0000000 --- a/benchmark_results/marlin_w8a16_fp8kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T13-17-27.453222.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - 
"fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768569254.4509277, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime 
version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1554.063\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - 
"151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 2002357.032112231, - "end_time": 2002551.673273827, - "total_evaluation_time_seconds": "194.64116159593686" -} \ No newline at end of file diff --git a/benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json b/benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json deleted file mode 100644 index 44433b9..0000000 --- a/benchmark_results/w4a16_bf16kv/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T11-53-35.800494.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.8, - "exact_match_stderr,strict-match": 0.19999999999999998, - "exact_match,flexible-extract": 0.8, - "exact_match_stderr,flexible-extract": 0.19999999999999998 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 5 - } - }, - "config": { - "model": "diffulex", - "model_args": 
"pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 5.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768564227.2826512, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.566\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch 
osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1997329.915016455, - "end_time": 1997520.020547304, - "total_evaluation_time_seconds": "190.10553084895946" -} \ No newline at end of file diff --git a/benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json b/benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json deleted file mode 100644 index 9a04a3f..0000000 --- a/benchmark_results/w4a16_bf16kv_retest/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-16T12-11-26.946690.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.6, - "exact_match_stderr,strict-match": 0.1632993161855452, - "exact_match,flexible-extract": 0.6, - "exact_match_stderr,flexible-extract": 0.1632993161855452 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": 
"mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.7, - "max_model_len": 2048, - "max_num_batched_tokens": 4096, - "max_num_seqs": 128, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "decode_mode": "varlen" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 10 - } - }, - "config": { - "model": "diffulex", - "model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.7,max_model_len=2048,max_num_batched_tokens=4096,max_num_seqs=128,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,decode_mode=varlen", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 10.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768565293.9662197, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 10.1.243\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: 
Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.601\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - 
"model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1998396.598309235, - "end_time": 1998591.166686513, - "total_evaluation_time_seconds": "194.56837727804668" -} \ No newline at end of file diff --git a/benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json b/benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json deleted file mode 100644 index 660ce35..0000000 --- a/benchmark_results/w8a8_bf16kv_varlen_gpu1/__data1__ckpts__Dream-org__Dream-v0-Base-7B/results_2026-01-15T11-03-50.486126.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "results": { - "gsm8k": { - "alias": "gsm8k", - "exact_match,strict-match": 0.65, - "exact_match_stderr,strict-match": 0.1094243309804831, - "exact_match,flexible-extract": 0.7, - "exact_match_stderr,flexible-extract": 0.10513149660756933 - } - }, - "group_subtasks": { - "gsm8k": [] - }, - "configs": { - "gsm8k": { - "task": "gsm8k", - "tag": [ - "math_word_problems" - ], - "dataset_path": "gsm8k", - "dataset_name": "main", - "training_split": "train", - "test_split": "test", - "fewshot_split": "train", - "doc_to_text": "Question: {{question}}\nAnswer:", - "doc_to_target": "{{answer}}", - "unsafe_code": false, - "description": "", - "target_delimiter": " ", - "fewshot_delimiter": "\n\n", - "num_fewshot": 5, - "metric_list": [ - { - "metric": "exact_match", - "aggregation": "mean", - "higher_is_better": true, - "ignore_case": true, - "ignore_punctuation": false, - "regexes_to_ignore": [ - ",", - "\\$", - "(?s).*#### ", - "\\.$" - ] - } - ], - "output_type": "generate_until", - "generation_kwargs": { - "until": [ - "Question:", - "", - "<|im_end|>" - ], - "do_sample": false, - "temperature": 0.0 - }, - "repeats": 1, - "filter_list": [ - { - "name": "strict-match", - "filter": [ - { - "function": "regex", - "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" - }, - { - "function": "take_first" - } - ] - }, - { - "name": "flexible-extract", - "filter": [ - { - "function": "regex", - "group_select": -1, - "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" - }, - { - "function": "take_first" - } - ] - } - ], - "should_decontaminate": false, - "metadata": { - "version": 3.0, - "pretrained": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name": "dream", - "decoding_strategy": "d2f", - "mask_token_id": 151666, - "tensor_parallel_size": 1, - "data_parallel_size": 1, - "gpu_memory_utilization": 0.5, - "max_model_len": 2048, - "max_num_batched_tokens": 2048, - "max_num_seqs": 64, - "temperature": 0.0, - "max_new_tokens": 512, - "use_lora": false, - "enforce_eager": true, - "kv_cache_layout": "unified", - "accept_threshold": 0.9, - "complete_threshold": 0.95, - "add_new_block_threshold": 0.1, - "diffusion_block_size": 32, - "wait_ready": true, - "kv_cache_dtype": "bf16", - "decode_mode": "varlen", - "linear_attn_weight_dtype": "int8", - "linear_mlp_weight_dtype": "int8", - "linear_attn_act_dtype": "int8", - "linear_mlp_act_dtype": "int8" - } - } - }, - "versions": { - "gsm8k": 3.0 - }, - "n-shot": { - "gsm8k": 5 - }, - "higher_is_better": { - "gsm8k": { - "exact_match": true - } - }, - "n-samples": { - "gsm8k": { - "original": 1319, - "effective": 20 - } - }, - "config": { - "model": "diffulex", - 
"model_args": "pretrained=/data1/ckpts/Dream-org/Dream-v0-Base-7B,model_name=dream,decoding_strategy=d2f,mask_token_id=151666,tensor_parallel_size=1,data_parallel_size=1,gpu_memory_utilization=0.5,max_model_len=2048,max_num_batched_tokens=2048,max_num_seqs=64,temperature=0.0,max_new_tokens=512,use_lora=False,enforce_eager=True,kv_cache_layout=unified,accept_threshold=0.9,complete_threshold=0.95,add_new_block_threshold=0.1,diffusion_block_size=32,wait_ready=True,kv_cache_dtype=bf16,decode_mode=varlen,linear_attn_weight_dtype=int8,linear_mlp_weight_dtype=int8,linear_attn_act_dtype=int8,linear_mlp_act_dtype=int8", - "batch_size": "1", - "batch_sizes": [], - "device": null, - "use_cache": null, - "limit": 20.0, - "bootstrap_iters": 100000, - "gen_kwargs": null, - "random_seed": 0, - "numpy_seed": 1234, - "torch_seed": 1234, - "fewshot_seed": 1234 - }, - "git_hash": "426b314", - "date": 1768474154.0957432, - "pretty_env_info": "PyTorch version: 2.9.1+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nClang version: Could not collect\nCMake version: version 3.27.7\nLibc version: glibc-2.31\n\nPython version: 3.12.12 (main, Oct 14 2025, 21:25:31) [Clang 20.1.4 ] (64-bit runtime)\nPython platform: Linux-5.4.0-216-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: 12.2.91\nCUDA_MODULE_LOADING set to: \nGPU models and configuration: \nGPU 0: NVIDIA GeForce RTX 4090\nGPU 1: NVIDIA GeForce RTX 4090\nGPU 2: NVIDIA GeForce RTX 4090\nGPU 3: NVIDIA GeForce RTX 4090\nGPU 4: NVIDIA GeForce RTX 4090\nGPU 5: NVIDIA GeForce RTX 4090\nGPU 6: NVIDIA GeForce RTX 4090\nGPU 7: NVIDIA GeForce RTX 4090\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nIs XPU available: False\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 46 bits physical, 57 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 17\nModel name: AMD EPYC 9334 32-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 1557.564\nCPU max MHz: 2700.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 5391.92\nVirtualization: AMD-V\nL1d cache: 2 MiB\nL1i cache: 2 MiB\nL2 cache: 64 MiB\nL3 cache: 256 MiB\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Enhanced / Automatic IBRS; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma 
cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp ibrs_enhanced vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid overflow_recov succor smca flush_l1d sme sev sev_es\n\nVersions of relevant libraries:\n[pip3] numpy==2.3.5\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cudnn-frontend==1.16.0\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.5\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.9.1\n[pip3] torch_c_dlpack_ext==0.1.4\n[pip3] torchaudio==2.9.0\n[pip3] torchvision==0.24.0\n[pip3] triton==3.5.1\n[conda] Could not collect", - "transformers_version": "4.57.3", - "lm_eval_version": "0.4.9.2", - "upper_git_hash": null, - "tokenizer_pad_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_eos_token": [ - "<|endoftext|>", - "151643" - ], - "tokenizer_bos_token": [ - "<|beginoftext|>", - "151665" - ], - "eot_token_id": null, - "max_length": 2048, - "task_hashes": {}, - "model_source": "diffulex", - "model_name": "/data1/ckpts/Dream-org/Dream-v0-Base-7B", - "model_name_sanitized": "__data1__ckpts__Dream-org__Dream-v0-Base-7B", - "system_instruction": null, - "system_instruction_sha": null, - "fewshot_as_multiturn": false, - "chat_template": null, - "chat_template_sha": null, - "start_time": 1907256.733360387, - "end_time": 1908134.706131824, - "total_evaluation_time_seconds": "877.9727714371402" -} \ No newline at end of file From a594135a0b85640d65d6dac24fe2ab322c7779c5 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 18 Jan 2026 06:40:55 +0000 Subject: [PATCH 03/10] =?UTF-8?q?=E5=8D=87=E7=BA=A7=20quantize=5Fmodel.py?= =?UTF-8?q?=20=E4=B8=BA=E7=9C=9F=E6=AD=A3=E7=9A=84=20GPTQ/AWQ=20=E9=87=8F?= =?UTF-8?q?=E5=8C=96=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加 quant-method=auto 支持:使用 auto-gptq / awq 进行真正的校准量化 - 添加校准数据参数:--calib-text-file, --calib-num-samples, --calib-seq-len 等 - 实现 _export_autogptq_to_vllm_weights:从 auto-gptq 量化模型中导出 vLLM 格式权重 - 实现 _export_awq_to_vllm_weights:从 awq 量化模型中导出 vLLM 格式权重 - 保留 quant-method=simple 旧实现作为后向兼容 - 修复 loader.py 中 gptq_marlin scales 的 shape 推理和 TP sharding 逻辑 - 修复 linear_gptq_marlin_w4a16.py 移除不必要的 bf16->fp16 转换 --- diffulex/utils/loader.py | 32 +- diffulex/utils/quantization/quantize_model.py | 568 ++++++++++++++---- .../strategies/linear_gptq_marlin_w4a16.py | 6 +- 3 files changed, 497 insertions(+), 109 deletions(-) diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index fb608f9..622e7e2 
100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -403,10 +403,14 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): group_size = int(ckpt_group_size) else: if is_gptq_marlin_ckpt and len(scales.shape) == 2 and int(scales.shape[0]) > 0: - # marlin scales often use first dim = 2 * num_groups - num_groups = int(scales.shape[0]) // 2 + # vLLM marlin_permute_scales keeps shape [num_groups, N] for most cases. + # Some older/alternate layouts may use [2*num_groups, N/2]. + num_groups = int(scales.shape[0]) if num_groups > 0 and in_features % num_groups == 0: group_size = in_features // num_groups + elif num_groups % 2 == 0 and (in_features % (num_groups // 2)) == 0: + # Fallback for legacy 2*num_groups layouts. + group_size = in_features // (num_groups // 2) else: num_groups = int(qzeros.shape[0]) if getattr(qzeros, "numel", lambda: 1)() > 0 else 0 if num_groups > 0 and in_features % num_groups == 0: @@ -544,8 +548,28 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): q_start = in_start // 16 q_end = in_end // 16 qweight = qweight[q_start:q_end, :] - # scales first dim is typically 2*num_groups - scales = scales[(2 * g_start):(2 * g_end), :] + # Shard scales on group dimension (K/group). + # vLLM marlin_permute_scales typically returns [num_groups, N]. + group_size_norm = in_features if group_size == -1 else group_size + expected_num_groups = in_features // group_size_norm if group_size_norm > 0 else 0 + if expected_num_groups <= 0: + print( + f"Warning: invalid expected_num_groups={expected_num_groups} for {module_name}. Skipping." + ) + skipped += 1 + continue + if int(scales.shape[0]) == expected_num_groups: + scales = scales[g_start:g_end, :] + elif int(scales.shape[0]) == 2 * expected_num_groups: + # Legacy/alternate layout: [2*num_groups, N/2] + scales = scales[(2 * g_start):(2 * g_end), :] + else: + print( + f"Warning: unexpected gptq_marlin scales.shape[0]={int(scales.shape[0])} " + f"(expected {expected_num_groups} or {2*expected_num_groups}) for {module_name}. Skipping." 
+ ) + skipped += 1 + continue if g_idx is not None and getattr(g_idx, "numel", lambda: 1)() > 0: g_idx = g_idx[in_start:in_end] in_features = in_per diff --git a/diffulex/utils/quantization/quantize_model.py b/diffulex/utils/quantization/quantize_model.py index bd77977..4c004c5 100644 --- a/diffulex/utils/quantization/quantize_model.py +++ b/diffulex/utils/quantization/quantize_model.py @@ -12,7 +12,15 @@ --output-path /path/to/output \ --quant-format gptq_marlin \ --group-size 128 \ - --bits 4 + --bits 4 \ + --quant-method auto \ + --calib-text-file /path/to/calib.txt \ + --calib-num-samples 128 \ + --calib-seq-len 512 + +说明: +- `quant-method=simple`:沿用当前“直接分组量化/舍入”的旧实现(不需要校准数据,不是真 GPTQ/AWQ)。 +- `quant-method=auto`:使用 `auto-gptq` / `awq(autoawq)` 做真正的校准量化,然后导出为 vLLM/Diffulex 可加载的权重格式。 """ from __future__ import annotations @@ -20,6 +28,7 @@ import argparse import os import json +import random from pathlib import Path from typing import Optional @@ -37,7 +46,7 @@ if str(_REPO_ROOT) not in sys.path: sys.path.insert(0, str(_REPO_ROOT)) -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from safetensors import safe_open from glob import glob @@ -72,6 +81,69 @@ def _require_vllm_marlin(): return ops, marlin_permute_scales +def _require_auto_gptq(): + try: + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "未能导入 auto-gptq。请确认已在当前 .venv 安装(例如:BUILD_CUDA_EXT=0 pip install auto-gptq)。" + ) from e + return AutoGPTQForCausalLM, BaseQuantizeConfig + + +def _require_awq(): + try: + from awq import AutoAWQForCausalLM # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "未能导入 awq(autoawq 的导入名是 `awq`)。" + ) from e + return AutoAWQForCausalLM + + +def _load_calib_texts( + calib_text_file: str, *, num_samples: int, seed: int +) -> list[str]: + p = Path(calib_text_file) + if not p.exists(): + raise FileNotFoundError(f"calib_text_file 不存在: {calib_text_file}") + lines = [ln.strip() for ln in p.read_text(encoding="utf-8", errors="ignore").splitlines()] + lines = [ln for ln in lines if ln] + if not lines: + raise ValueError(f"calib_text_file 为空: {calib_text_file}") + if num_samples <= 0: + raise ValueError(f"calib_num_samples 必须 > 0, got {num_samples}") + if len(lines) <= num_samples: + return lines[:num_samples] + rng = random.Random(seed) + return rng.sample(lines, k=num_samples) + + +def _build_autogptq_examples( + tokenizer, texts: list[str], *, seq_len: int +) -> list[dict[str, torch.Tensor]]: + if seq_len <= 0: + raise ValueError(f"calib_seq_len 必须 > 0, got {seq_len}") + + # AutoGPTQ 会自行 collate/pad;这里用 fixed max_length 保持输入一致。 + examples: list[dict[str, torch.Tensor]] = [] + for t in texts: + enc = tokenizer( + t, + return_tensors="pt", + truncation=True, + max_length=seq_len, + padding="max_length", + ) + examples.append( + { + "input_ids": enc["input_ids"], + "attention_mask": enc.get("attention_mask", torch.ones_like(enc["input_ids"])), + } + ) + return examples + + def _quantize_to_vllm_gptq( weight: torch.Tensor, *, group_size: int, bits: int, use_v2_format: bool = False ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -218,6 +290,129 @@ def _quantize_to_vllm_awq( return qweight, qzeros, scales +@torch.inference_mode() +def _export_autogptq_to_vllm_weights( + *, + gptq_base_model: nn.Module, + quant_format: str, + target_modules: Optional[list[str]], + desc_act: bool, + 
bits: int, + group_size: int, +) -> dict[str, torch.Tensor]: + """ + 从 auto-gptq 的量化后模型中抽取 qweight/qzeros/scales/g_idx,并按 vLLM/Diffulex 的命名导出。 + - quant_format == "gptq": 直接导出 QuantLinear 的 buffers。 + - quant_format == "gptq_marlin": 在导出前使用 vLLM Marlin 的 repack/permute,且导出空 qzeros/g_idx。 + """ + quantized_weights: dict[str, torch.Tensor] = {} + + if quant_format not in ("gptq", "gptq_marlin"): + raise ValueError(f"Unexpected quant_format for auto-gptq export: {quant_format}") + + if quant_format == "gptq_marlin": + if not torch.cuda.is_available(): + raise RuntimeError("导出 gptq_marlin 需要 CUDA(vLLM Marlin repack 为 CUDA op)。") + ops, marlin_permute_scales = _require_vllm_marlin() + + for module_name, module in gptq_base_model.named_modules(): + # AutoGPTQ 的 QuantLinear(triton/cuda)会有这些 buffer + if not (hasattr(module, "qweight") and hasattr(module, "qzeros") and hasattr(module, "scales")): + continue + + # 过滤:保持和旧脚本一致,默认不量化 lm_head + if "lm_head" in module_name: + continue + if target_modules and not any(t in module_name for t in target_modules): + continue + + qweight = getattr(module, "qweight") + qzeros = getattr(module, "qzeros") + scales = getattr(module, "scales") + g_idx = getattr(module, "g_idx", None) + + if not isinstance(qweight, torch.Tensor) or not isinstance(qzeros, torch.Tensor) or not isinstance(scales, torch.Tensor): + continue + + if quant_format == "gptq": + quantized_weights[f"{module_name}.qweight"] = qweight.detach().cpu().contiguous() + quantized_weights[f"{module_name}.qzeros"] = qzeros.detach().cpu().contiguous() + quantized_weights[f"{module_name}.scales"] = scales.detach().cpu().contiguous() + if desc_act and isinstance(g_idx, torch.Tensor) and g_idx.numel() > 0: + quantized_weights[f"{module_name}.g_idx"] = g_idx.detach().to(dtype=torch.int32).cpu().contiguous() + else: + quantized_weights[f"{module_name}.g_idx"] = torch.empty((0,), dtype=torch.int32) + continue + + # gptq_marlin 导出:用 vLLM 的 repack/permute 变成 Marlin-ready layout + in_features = int(getattr(module, "infeatures", 0)) + out_features = int(getattr(module, "outfeatures", 0)) + if in_features <= 0 or out_features <= 0: + # fallback:从张量形状推断(qweight shape: [K/pack, N]) + out_features = int(qweight.shape[1]) + pack = 32 // bits + in_features = int(qweight.shape[0] * pack) + + group_size_norm = in_features if group_size == -1 else group_size + empty_perm = torch.empty((0,), dtype=torch.int32, device="cuda") + + qweight_cuda = qweight.contiguous().to(device="cuda") + scales_cuda = scales.contiguous().to(device="cuda", dtype=torch.float16) + + marlin_qweight = ops.gptq_marlin_repack( + qweight_cuda, + perm=empty_perm, + size_k=in_features, + size_n=out_features, + num_bits=bits, + is_a_8bit=(bits == 8), + ).contiguous() + marlin_scales = marlin_permute_scales( + scales_cuda, + size_k=in_features, + size_n=out_features, + group_size=group_size_norm, + is_a_8bit=(bits == 8), + ).contiguous() + + quantized_weights[f"{module_name}.qweight"] = marlin_qweight.detach().cpu().contiguous() + quantized_weights[f"{module_name}.qzeros"] = torch.empty((0,), dtype=torch.int32) + quantized_weights[f"{module_name}.scales"] = marlin_scales.detach().cpu().contiguous() + quantized_weights[f"{module_name}.g_idx"] = torch.empty((0,), dtype=torch.int32) + + return quantized_weights + + +@torch.inference_mode() +def _export_awq_to_vllm_weights( + *, + awq_base_model: nn.Module, + target_modules: Optional[list[str]], +) -> dict[str, torch.Tensor]: + """ + 从 awq(pack 后)模型中抽取 qweight/qzeros/scales,并按 vLLM/Diffulex 的命名导出。 + """ + 
quantized_weights: dict[str, torch.Tensor] = {} + for module_name, module in awq_base_model.named_modules(): + if not (hasattr(module, "qweight") and hasattr(module, "qzeros") and hasattr(module, "scales")): + continue + if "lm_head" in module_name: + continue + if target_modules and not any(t in module_name for t in target_modules): + continue + + qweight = getattr(module, "qweight") + qzeros = getattr(module, "qzeros") + scales = getattr(module, "scales") + if not isinstance(qweight, torch.Tensor) or not isinstance(qzeros, torch.Tensor) or not isinstance(scales, torch.Tensor): + continue + + quantized_weights[f"{module_name}.qweight"] = qweight.detach().cpu().contiguous() + quantized_weights[f"{module_name}.qzeros"] = qzeros.detach().cpu().contiguous() + quantized_weights[f"{module_name}.scales"] = scales.detach().cpu().contiguous() + return quantized_weights + + def quantize_model( model_path: str, output_path: str, @@ -226,6 +421,18 @@ def quantize_model( bits: int = 4, target_modules: Optional[list[str]] = None, device: str = "cpu", + quant_method: str = "auto", + calib_text_file: Optional[str] = None, + calib_num_samples: int = 128, + calib_seq_len: int = 512, + calib_batch_size: int = 1, + calib_seed: int = 0, + # GPTQ config + desc_act: bool = False, + sym: bool = True, + damp_percent: float = 0.01, + true_sequential: bool = True, + use_triton: bool = True, ) -> None: """Quantize model weights to GPTQ/AWQ format. @@ -238,117 +445,209 @@ def quantize_model( target_modules: List of module name patterns to quantize (e.g., ["q_proj", "k_proj"]). If None, quantizes all linear layers. device: Device to use for quantization ("cpu" or "cuda") + quant_method: "auto"(真 GPTQ/AWQ,需校准数据)或 "simple"(旧实现,无校准) + calib_text_file: 校准文本文件(每行一条样本) """ if quant_format not in ["gptq", "gptq_marlin", "awq"]: raise ValueError( f"Unsupported quant_format: {quant_format}. Must be 'gptq', 'gptq_marlin' or 'awq'" ) + if quant_method not in ["auto", "simple"]: + raise ValueError("quant_method must be 'auto' or 'simple'") + + # Marlin GPTQ 强约束:对称量化 + 不使用 act-order + if quant_format == "gptq_marlin": + desc_act = False + sym = True output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) - # Load model config - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - - # Load model weights from safetensors files - safetensors_files = list(glob(os.path.join(model_path, "*.safetensors"))) - if not safetensors_files: - raise ValueError(f"No safetensors files found in {model_path}") - - print(f"Found {len(safetensors_files)} safetensors files") - - # Collect all weight names - all_weight_keys = [] - for file in safetensors_files: - with safe_open(file, "pt", device) as f: - all_weight_keys.extend(f.keys()) - - # Filter to linear layer weights only (exclude biases and non-linear layers) - linear_weight_keys = [] - for key in all_weight_keys: - # Skip biases, layer norms, embeddings, etc. 
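For reference, the extended quantize_model() signature introduced in this hunk can also be driven programmatically. The sketch below is an editor's illustration, not code from the patch; the paths are placeholders and the import path simply follows the file's location in the repo.

# Illustrative sketch only: drive the new auto-GPTQ path from Python.
# Paths are placeholders; parameter names follow the signature added above.
from diffulex.utils.quantization.quantize_model import quantize_model

quantize_model(
    model_path="/path/to/Dream-v0-Base-7B",        # placeholder
    output_path="/path/to/output-gptq-marlin",     # placeholder
    quant_format="gptq_marlin",
    group_size=128,
    bits=4,
    device="cuda",                 # gptq_marlin export needs CUDA (vLLM Marlin repack)
    quant_method="auto",           # real GPTQ via auto-gptq, requires calibration data
    calib_text_file="/path/to/calib.txt",          # placeholder; one sample per line
    calib_num_samples=128,
    calib_seq_len=512,
)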
- # Note: lm_head is excluded because ParallelLMHead doesn't support offline quantization yet - if any(skip in key for skip in [".bias", ".norm", ".embed", ".lm_head"]): - continue - # Only process weight parameters - if not key.endswith(".weight"): - continue - # Check if target_modules filter applies - if target_modules: - if not any(target in key for target in target_modules): - continue - linear_weight_keys.append(key) - - print(f"Found {len(linear_weight_keys)} linear layer weights to quantize") - - # Quantize each linear layer - quantized_weights = {} + # Load model config (for tokenizer special tokens, etc.) + _ = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + + quantized_weights: dict[str, torch.Tensor] = {} metadata = { "quant_format": quant_format, + "quant_method": quant_method, "group_size": group_size, "bits": bits, "quantized_modules": [], } - - for key in tqdm(linear_weight_keys, desc="Quantizing weights"): - # Load weight from safetensors - weight = None - source_file = None - for file in safetensors_files: - with safe_open(file, "pt", device) as f: - if key in f.keys(): - weight = f.get_tensor(key) - source_file = file - break - - if weight is None: - print(f"Warning: Could not load weight for {key}") - continue - - # Skip if weight is not 2D (not a linear layer weight) - if weight.dim() != 2: - print(f"Skipping {key}: not a 2D weight (shape: {weight.shape})") - continue - - out_features, in_features = weight.shape - - # Convert to float32 for quantization - weight_fp32 = weight.to(torch.float32).to(device) - - # Quantize - prefix = key[:-7] # Remove ".weight" - if quant_format == "gptq": - qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq( - weight_fp32, group_size=group_size, bits=bits, use_v2_format=False + + # ---------------------------- + # 真 GPTQ/AWQ(需要校准数据) + # ---------------------------- + if quant_method == "auto": + if calib_text_file is None: + raise ValueError("quant_method=auto 需要提供 --calib-text-file") + + texts = _load_calib_texts(calib_text_file, num_samples=calib_num_samples, seed=calib_seed) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True) + if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + + if quant_format in ("gptq", "gptq_marlin"): + if quant_format == "gptq_marlin" and device != "cuda": + raise ValueError("导出 gptq_marlin 需要 --device cuda") + + AutoGPTQForCausalLM, BaseQuantizeConfig = _require_auto_gptq() + examples = _build_autogptq_examples(tokenizer, texts, seq_len=calib_seq_len) + + qcfg = BaseQuantizeConfig( + bits=int(bits), + group_size=int(group_size), + damp_percent=float(damp_percent), + desc_act=bool(desc_act), + sym=bool(sym), + true_sequential=bool(true_sequential), ) - elif quant_format == "gptq_marlin": - qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq_marlin( - weight_fp32, group_size=group_size, bits=bits + + model_init_kwargs = { + "trust_remote_code": True, + } + # 让 AutoGPTQ 自己用 accelerate 做 device_map;CPU 模式下走默认加载。 + if device == "cuda": + model_init_kwargs["device_map"] = "auto" + model_init_kwargs["torch_dtype"] = torch.float16 + + gptq_model = AutoGPTQForCausalLM.from_pretrained( + model_path, + qcfg, + **model_init_kwargs, + ) + gptq_model.quantize( + examples, + batch_size=int(calib_batch_size), + use_triton=bool(use_triton), + cache_examples_on_gpu=(device == "cuda"), ) - quantized_weights[f"{prefix}.qweight"] = qweight.cpu() - quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() - 
quantized_weights[f"{prefix}.scales"] = scales.cpu() - # Keep g_idx key for compatibility (often empty when desc_act=False). - quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() + + quantized_weights = _export_autogptq_to_vllm_weights( + gptq_base_model=gptq_model.model, + quant_format=quant_format, + target_modules=target_modules, + desc_act=bool(desc_act), + bits=int(bits), + group_size=int(group_size), + ) + else: # awq - qweight, qzeros, scales = _quantize_to_vllm_awq( - weight_fp32, group_size=group_size, bits=bits + if bits != 4: + raise ValueError(f"AWQ 目前仅支持 4-bit,当前 bits={bits}") + AutoAWQForCausalLM = _require_awq() + + awq_model = AutoAWQForCausalLM.from_pretrained( + model_path, + trust_remote_code=True, + safetensors=True, + device_map="auto" if device == "cuda" else None, + torch_dtype="auto", + ) + + awq_model.quantize( + tokenizer=tokenizer, + quant_config={ + "zero_point": True, + "q_group_size": int(group_size), + "w_bit": int(bits), + "version": "GEMM", + }, + calib_data=texts, + max_calib_samples=int(calib_num_samples), + max_calib_seq_len=int(calib_seq_len), + ) + awq_model.pack() + + quantized_weights = _export_awq_to_vllm_weights( + awq_base_model=awq_model.model, + target_modules=target_modules, + ) + + # ---------------------------- + # 旧实现(无校准,不是真 GPTQ/AWQ) + # ---------------------------- + else: + safetensors_files = list(glob(os.path.join(model_path, "*.safetensors"))) + if not safetensors_files: + raise ValueError(f"No safetensors files found in {model_path}") + + print(f"Found {len(safetensors_files)} safetensors files") + + all_weight_keys: list[str] = [] + for file in safetensors_files: + with safe_open(file, "pt", device) as f: + all_weight_keys.extend(f.keys()) + + linear_weight_keys: list[str] = [] + for key in all_weight_keys: + if any(skip in key for skip in [".bias", ".norm", ".embed", ".lm_head"]): + continue + if not key.endswith(".weight"): + continue + if target_modules and not any(target in key for target in target_modules): + continue + linear_weight_keys.append(key) + + print(f"Found {len(linear_weight_keys)} linear layer weights to quantize") + + for key in tqdm(linear_weight_keys, desc="Quantizing weights (simple)"): + weight = None + for file in safetensors_files: + with safe_open(file, "pt", device) as f: + if key in f.keys(): + weight = f.get_tensor(key) + break + + if weight is None: + print(f"Warning: Could not load weight for {key}") + continue + if weight.dim() != 2: + print(f"Skipping {key}: not a 2D weight (shape: {weight.shape})") + continue + + out_features, in_features = weight.shape + weight_fp32 = weight.to(torch.float32).to(device) + prefix = key[:-7] # Remove ".weight" + + if quant_format == "gptq": + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq( + weight_fp32, group_size=group_size, bits=bits, use_v2_format=False + ) + quantized_weights[f"{prefix}.qweight"] = qweight.cpu() + quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() + quantized_weights[f"{prefix}.scales"] = scales.cpu() + quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() + + elif quant_format == "gptq_marlin": + qweight, qzeros, scales, g_idx = _quantize_to_vllm_gptq_marlin( + weight_fp32, group_size=group_size, bits=bits + ) + quantized_weights[f"{prefix}.qweight"] = qweight.cpu() + quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() + quantized_weights[f"{prefix}.scales"] = scales.cpu() + quantized_weights[f"{prefix}.g_idx"] = g_idx.cpu() + + else: # awq + qweight, qzeros, scales = _quantize_to_vllm_awq( + weight_fp32, group_size=group_size, 
bits=bits + ) + quantized_weights[f"{prefix}.qweight"] = qweight.cpu() + quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() + quantized_weights[f"{prefix}.scales"] = scales.cpu() + + metadata["quantized_modules"].append( + { + "name": prefix, + "out_features": int(out_features), + "in_features": int(in_features), + "group_size": group_size, + "bits": bits, + } ) - quantized_weights[f"{prefix}.qweight"] = qweight.cpu() - quantized_weights[f"{prefix}.qzeros"] = qzeros.cpu() - quantized_weights[f"{prefix}.scales"] = scales.cpu() - - metadata["quantized_modules"].append({ - "name": prefix, - "out_features": int(out_features), - "in_features": int(in_features), - "group_size": group_size, - "bits": bits, - }) - - # Clear GPU cache if using CUDA - if device == "cuda": - torch.cuda.empty_cache() + + if device == "cuda": + torch.cuda.empty_cache() # Copy all model files (config, tokenizer, etc.) to output directory import shutil @@ -379,22 +678,34 @@ def quantize_model( with open(metadata_file, "w") as f: json.dump(metadata, f, indent=2) - # vLLM GPTQ/GPTQ-Marlin 会读取 quantize_config.json - # - gptq_marlin: 需要 sym/desc_act 等字段用于识别并选择 Marlin kernel - if quant_format == "gptq_marlin": + # vLLM/Diffulex 会读取 quantize_config.json 识别量化类型与超参 + if quant_format in ("gptq", "gptq_marlin", "awq"): + if quant_format == "gptq_marlin": + cfg_desc_act = False + cfg_sym = True + cfg_ckpt = "gptq_marlin" + elif quant_format == "gptq": + cfg_desc_act = bool(desc_act) + cfg_sym = bool(sym) + cfg_ckpt = "gptq" + else: # awq + cfg_desc_act = False + cfg_sym = False + cfg_ckpt = "awq" + quantize_cfg = { "bits": int(bits), "group_size": int(group_size), - "desc_act": False, - "sym": True, + "desc_act": bool(cfg_desc_act), + "sym": bool(cfg_sym), "lm_head": False, - "checkpoint_format": "gptq_marlin", + "checkpoint_format": cfg_ckpt, } - with open(output_path / "quantize_config.json", "w") as f: + with open(output_path / "quantize_config.json", "w", encoding="utf-8") as f: json.dump(quantize_cfg, f, indent=2) print(f"\n✓ Quantization complete!") - print(f" - Quantized {len(metadata['quantized_modules'])} modules") + print(f" - Quant method: {quant_method}") print(f" - Output directory: {output_path}") print(f" - Quantized weights file: {output_file}") print(f" - Metadata file: {metadata_file}") @@ -420,6 +731,48 @@ def main(): parser.add_argument("--bits", type=int, default=4, help="每个权重的位数 (默认: 4)") parser.add_argument("--target-modules", type=str, help="要量化的模块名称模式(逗号分隔),例如: q_proj,k_proj,v_proj") parser.add_argument("--device", type=str, choices=["cpu", "cuda"], default="cpu", help="量化设备 (默认: cpu)") + parser.add_argument( + "--quant-method", + type=str, + choices=["auto", "simple"], + default="auto", + help="量化方法: auto(真 GPTQ/AWQ, 需要校准数据) / simple(旧实现, 无校准)", + ) + parser.add_argument("--calib-text-file", type=str, default=None, help="校准文本文件(每行一条样本)") + parser.add_argument("--calib-num-samples", type=int, default=128, help="校准样本数 (默认: 128)") + parser.add_argument("--calib-seq-len", type=int, default=512, help="校准序列长度 (默认: 512)") + parser.add_argument("--calib-batch-size", type=int, default=1, help="校准 batch size (默认: 1)") + parser.add_argument("--calib-seed", type=int, default=0, help="校准采样随机种子 (默认: 0)") + parser.add_argument("--desc-act", action="store_true", help="GPTQ act-order(desc_act) (默认: False)") + parser.add_argument("--sym", dest="sym", action="store_true", default=True, help="GPTQ symmetric quant (默认: True)") + parser.add_argument("--no-sym", dest="sym", action="store_false", help="关闭 GPTQ symmetric quant") 
+ parser.add_argument("--damp-percent", type=float, default=0.01, help="GPTQ damp_percent (默认: 0.01)") + parser.add_argument( + "--true-sequential", + dest="true_sequential", + action="store_true", + default=True, + help="GPTQ true_sequential (默认: True)", + ) + parser.add_argument( + "--no-true-sequential", + dest="true_sequential", + action="store_false", + help="关闭 GPTQ true_sequential", + ) + parser.add_argument( + "--use-triton", + dest="use_triton", + action="store_true", + default=True, + help="AutoGPTQ 使用 Triton backend (默认: True)", + ) + parser.add_argument( + "--no-triton", + dest="use_triton", + action="store_false", + help="关闭 AutoGPTQ Triton backend(可能回退到 CUDA extension)", + ) args = parser.parse_args() @@ -435,6 +788,17 @@ def main(): bits=args.bits, target_modules=target_modules, device=args.device, + quant_method=args.quant_method, + calib_text_file=args.calib_text_file, + calib_num_samples=args.calib_num_samples, + calib_seq_len=args.calib_seq_len, + calib_batch_size=args.calib_batch_size, + calib_seed=args.calib_seed, + desc_act=bool(args.desc_act), + sym=bool(args.sym), + damp_percent=float(args.damp_percent), + true_sequential=bool(args.true_sequential), + use_triton=bool(args.use_triton), ) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py index da81d3e..c544166 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py @@ -112,8 +112,8 @@ def linear_forward( else: raise RuntimeError(f"gptq_marlin: unsupported weight_bits={weight_bits} (expected 4 or 8)") - # vLLM marlin kernels expect FP16 activations. - x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + # Align with vLLM Marlin: accept bf16/fp16 activations directly. + x_in = x # g_idx can be empty (desc_act=False). Ensure correct dtype/device. if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): @@ -152,5 +152,5 @@ def linear_forward( bias=marlin_bias, input_dtype=None, ) - return out.to(dtype=x.dtype) if out.dtype != x.dtype else out + return out From 8824ccdbaf1a7651b617ceda9ab7f1a44974b57c Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 18 Jan 2026 15:33:01 +0000 Subject: [PATCH 04/10] =?UTF-8?q?refactor:=20=E4=BC=98=E5=8C=96=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E7=BB=93=E6=9E=84=E5=92=8C=E6=B6=88=E9=99=A4=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 主要重构内容: 1. **diffulex/layer/linear.py** - 大幅简化量化逻辑(-197行): - 新增 `_forward_base()`: 统一的前向分发器,替换子类中重复的量化分支逻辑 - 新增 `_build_offline_forward_kwargs()`: 统一构建离线量化(GPTQ/AWQ)前向参数 - 新增 `_get_linear_strategy()`, `_offline_meta()`, `_infer_gptq_weight_bits()` 等辅助方法 - 修复 `LoRAMixin.merge_lora` 中 base weight 为 None 的边界情况 - 移除未使用的导入(marlin_zero_points, unpack_cols, marlin_make_empty_g_idx) 2. **diffulex/utils/loader.py** - 优化性能和代码结构: - 一次性扫描 safetensors 文件建立 key_to_file 索引,避免重复文件 I/O - 缓存 `model.named_modules()` 结果,避免重复构建字典 - 新增 `_find_offline_capable_module()`: 统一模块查找逻辑 - 新增 `_load_tensors_for_prefix()`: 集中加载张量,仅打开必要的文件 - 将 print() 替换为 logger.warning()/logger.exception() 以规范化日志 3. **diffulex/engine/model_runner.py** - 消除重复循环: - 在 `allocate_kv_cache` 中统一缓存 attention 模块列表 - 用 `enumerate(attn_modules)` 替换重复的模块遍历循环 4. **diffulex/utils/quantization/strategies/linear_int4_w4a16.py** - 修复缺失实现: - 添加 `quantize_weight_for_kernel` 方法,修复 W4A16 在线量化运行时错误 5. 
删除未使用的配置文件 `gptq_marlin_w2_bf16kv_varlen.yml` 测试: 已验证 W8A16 在线量化和 GPTQ 离线量化功能正常 --- diffulex/engine/model_runner.py | 34 +- diffulex/layer/linear.py | 492 ++++++------------ diffulex/utils/loader.py | 220 ++++---- .../strategies/linear_int4_w4a16.py | 18 + .../configs/gptq_marlin_w2_bf16kv_varlen.yml | 47 -- 5 files changed, 307 insertions(+), 504 deletions(-) delete mode 100644 diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml diff --git a/diffulex/engine/model_runner.py b/diffulex/engine/model_runner.py index c347fb3..eaa6e0a 100755 --- a/diffulex/engine/model_runner.py +++ b/diffulex/engine/model_runner.py @@ -217,6 +217,13 @@ def allocate_kv_cache(self): f"for kv cache on rank {self.rank}." ) + # Cache the list of Attention-like modules once, to keep binding logic consistent + # across cache layout branches (and avoid duplicated traversal). + attn_modules = [ + m for m in self.model.modules() + if hasattr(m, "k_cache") and hasattr(m, "v_cache") + ] + if config.kv_cache_layout == "distinct": x = config.k_cache_hdim_split_factor_x self.k_cache = torch.zeros( @@ -236,12 +243,9 @@ def allocate_kv_cache(self): self.block_size, dtype=storage_dtype, ) - layer_id = 0 - for module in self.model.modules(): - if hasattr(module, "k_cache") and hasattr(module, "v_cache"): - module.k_cache = self.k_cache[layer_id] - module.v_cache = self.v_cache[layer_id] - layer_id += 1 + for layer_id, module in enumerate(attn_modules): + module.k_cache = self.k_cache[layer_id] + module.v_cache = self.v_cache[layer_id] elif config.kv_cache_layout == "unified": self.kv_cache = torch.zeros( 2, @@ -252,12 +256,9 @@ def allocate_kv_cache(self): head_dim, dtype=storage_dtype, ) - layer_id = 0 - for module in self.model.modules(): - if hasattr(module, "k_cache") and hasattr(module, "v_cache"): - module.k_cache = self.kv_cache[0, layer_id] - module.v_cache = self.kv_cache[1, layer_id] - layer_id += 1 + for layer_id, module in enumerate(attn_modules): + module.k_cache = self.kv_cache[0, layer_id] + module.v_cache = self.kv_cache[1, layer_id] else: raise ValueError( "Unsupported kv_cache_layout: {layout}. 
Supported values are 'distinct' and 'unified'.".format( @@ -287,12 +288,9 @@ def allocate_kv_cache(self): self.v_scale[:] = v_scale_init[None, :] # Bind scales to Attention modules - layer_id = 0 - for module in self.model.modules(): - if hasattr(module, "k_cache") and hasattr(module, "v_cache"): - module.k_scale = self.k_scale[layer_id] - module.v_scale = self.v_scale[layer_id] - layer_id += 1 + for layer_id, module in enumerate(attn_modules): + module.k_scale = self.k_scale[layer_id] + module.v_scale = self.v_scale[layer_id] def prepare_block_tables(self, seqs: list[SequenceBase]): max_len = max(len(seq.block_table) for seq in seqs) diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index 0ba2ceb..f26566d 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -400,8 +400,6 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: marlin_make_workspace_new, marlin_permute_scales, marlin_sort_g_idx, - marlin_zero_points, - unpack_cols, ) except Exception as e: # pragma: no cover raise RuntimeError( @@ -510,7 +508,6 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: from vllm import _custom_ops as ops # type: ignore from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore awq_to_marlin_zero_points, - marlin_make_empty_g_idx, marlin_make_workspace_new, marlin_permute_scales, ) @@ -570,8 +567,6 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: is_a_8bit=False, ) - # g_idx not used for AWQ marlin (keep empty, strategy will pass empties). - _ = marlin_make_empty_g_idx # keep import referenced for clarity self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: torch.Tensor) -> None: @@ -707,6 +702,168 @@ def _maybe_quantize_loaded_weight_param( # Keep attribute for compatibility, but ensure forward uses quant buffers. setattr(self, "weight", None) + def _get_linear_strategy(self): + """Return strategy for current `quant_kind` (or None). + + NOTE: do not swallow TypeError here; a wrong strategy type should fail fast. + """ + return get_linear_strategy(self.quant_kind) + + def _offline_meta(self) -> tuple[int, int, int]: + """Return (out_features, in_features, group_size) for offline GPTQ/AWQ.""" + return ( + int(self._offline_quant_out_features.item()), + int(self._offline_quant_in_features.item()), + int(self._offline_quant_group_size.item()), + ) + + def _infer_gptq_weight_bits(self, *, in_features: int) -> int: + """Infer/return GPTQ weight bits for downstream kernels. + + Priority: + - use recorded bits (e.g., marlin-exported layouts), + - otherwise infer from qweight packing. 
+ """ + bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + if bits > 0: + return bits + if self.gptq_qweight.numel() == 0: + raise RuntimeError("GPTQ bits 推断失败:gptq_qweight 为空。") + if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: + raise RuntimeError( + f"GPTQ bits 推断失败:in_features={in_features}, qweight.shape={tuple(self.gptq_qweight.shape)}" + ) + pack_factor = in_features // int(self.gptq_qweight.shape[0]) + if 32 % pack_factor != 0: + raise RuntimeError(f"GPTQ bits 推断失败:pack_factor={pack_factor} 不满足 32%pack_factor==0") + return 32 // pack_factor + + def _maybe_int4_original_in_features_kwargs(self, strategy, x: torch.Tensor) -> dict: + """Some int4 kernels need original K (before packing).""" + if strategy is None: + return {} + if getattr(strategy, "linear_weight_format", None) == "int4": + return {"original_in_features": x.shape[1]} + return {} + + def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: + """Build kwargs for offline GPTQ/AWQ (including Marlin variants).""" + if strategy is None: + raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + + format_val = int(self._offline_quant_format.item()) + weight_format = getattr(strategy, "linear_weight_format", None) + out_features, in_features, group_size = self._offline_meta() + + meta = { + "out_features": out_features, + "in_features": in_features, + "group_size": group_size, + } + + if format_val == 1: # GPTQ + # IMPORTANT: only gptq_gemm needs gptq_shuffle; marlin variants require the original format. + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + return { + **meta, + "gptq_qweight": self.gptq_qweight, + "gptq_qzeros": self.gptq_qzeros, + "gptq_scales": self.gptq_scales, + "gptq_group_size": group_size, + # Always pass g_idx (can be empty). vLLM expects it for GPTQ kernels. + "gptq_g_idx": self.gptq_g_idx, + } + + if weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + return { + **meta, + "gptq_weight_bits": bits, + "gptq_marlin_qweight": self.gptq_marlin_qweight, + "gptq_marlin_scales": self.gptq_marlin_scales, + "gptq_marlin_zp": self.gptq_marlin_zp, + "gptq_marlin_g_idx": self.gptq_marlin_g_idx, + "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, + "gptq_marlin_workspace": self.gptq_marlin_workspace, + } + + raise RuntimeError( + f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} is not compatible." + ) + + if format_val == 2: # AWQ + if weight_format == "awq": + return { + **meta, + "awq_qweight": self.awq_qweight, + "awq_qzeros": self.awq_qzeros, + "awq_scales": self.awq_scales, + "awq_group_size": group_size, + } + + if weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + return { + **meta, + "awq_marlin_qweight": self.awq_marlin_qweight, + "awq_marlin_scales": self.awq_marlin_scales, + "awq_marlin_zp": self.awq_marlin_zp, + "awq_marlin_workspace": self.awq_marlin_workspace, + "awq_weight_bits": 4, + } + + raise RuntimeError( + f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} is not compatible." 
+ ) + + raise RuntimeError(f"Unknown offline quant format: {format_val}") + + def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: + """Unified forward dispatcher for bf16 / online quant / offline GPTQ/AWQ.""" + strategy = self._get_linear_strategy() + # Runtime safety net: ensure we don't keep bf16+quant weights both resident. + self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) + + # Offline quantized weights (GPTQ/AWQ) have higher priority. + if self.has_offline_quantized_weight(): + if strategy is None: + raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + kwargs = self._build_offline_forward_kwargs(x, strategy) + return strategy.linear_forward( + x, + None, # weight not used for offline quantized weights + bias, + quant_kind=self.quant_kind, + **kwargs, + ) + + if self.has_quantized_weight(): + if strategy is None: + raise RuntimeError("Quantized weight is present but no linear strategy is configured.") + kwargs = {"quant_scales": self.quant_scales} + kwargs.update(self._maybe_int4_original_in_features_kwargs(strategy, x)) + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + **kwargs, + ) + + if strategy is None: + weight = getattr(self, "weight", None) + if weight is None: + raise RuntimeError("No strategy is configured and bf16 weight is missing.") + return F.linear(x, weight, bias) + + weight = getattr(self, "weight", None) + if weight is None: + raise RuntimeError("Strategy is configured but weight is missing (expected bf16 weight).") + kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) + return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) + def forward(self, x: torch.Tensor) -> torch.Tensor: raise NotImplementedError @@ -739,115 +896,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): self._maybe_quantize_loaded_weight_param(param, loaded_shard_id=None, expected_shard_ids={None}) def forward(self, x: torch.Tensor) -> torch.Tensor: - strategy = get_linear_strategy(self.quant_kind) - # Runtime safety net: ensure we don't keep bf16+quant weights both resident. - self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) - - # Check for offline quantized weights (GPTQ/AWQ) first - if self.has_offline_quantized_weight(): - if strategy is None: - raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) - out_features = int(self._offline_quant_out_features.item()) - in_features = int(self._offline_quant_in_features.item()) - group_size = int(self._offline_quant_group_size.item()) - weight_format = getattr(strategy, "linear_weight_format", None) - - kwargs = { - "out_features": out_features, - "in_features": in_features, - "group_size": group_size, - } - - if format_val == 1: # GPTQ - # IMPORTANT: only gptq_gemm needs gptq_shuffle; marlin variants require the original format. - if weight_format == "gptq": - self._maybe_prepare_offline_gptq(x) - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - # Always pass g_idx (can be empty). vLLM expects it for GPTQ kernels. - kwargs["gptq_g_idx"] = self.gptq_g_idx - elif weight_format == "gptq_marlin": - self._maybe_prepare_offline_gptq_marlin(x) - # Expose bits (needed to select scalar_types.* in strategy). 
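A quick numeric check of the packing arithmetic that `_infer_gptq_weight_bits` (added above) relies on; the values below are illustrative and not taken from the patch.

# 4-bit GPTQ, K = in_features = 4096:
#   pack factor = 32 // bits = 8, so the packed int32 qweight has K // 8 = 512 rows.
# The helper reverses this mapping when _offline_quant_bits was not recorded:
in_features = 4096
qweight_rows = 512                            # gptq_qweight.shape[0]
pack_factor = in_features // qweight_rows     # -> 8 (must divide 32 evenly)
bits = 32 // pack_factor                      # -> 4
assert bits == 4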
- bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 - if bits <= 0: - pack_factor = in_features // int(self.gptq_qweight.shape[0]) - bits = 32 // pack_factor - kwargs["gptq_weight_bits"] = bits - kwargs.update({ - "gptq_marlin_qweight": self.gptq_marlin_qweight, - "gptq_marlin_scales": self.gptq_marlin_scales, - "gptq_marlin_zp": self.gptq_marlin_zp, - "gptq_marlin_g_idx": self.gptq_marlin_g_idx, - "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, - "gptq_marlin_workspace": self.gptq_marlin_workspace, - }) - else: - raise RuntimeError( - f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - elif format_val == 2: # AWQ - if weight_format == "awq": - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) - elif weight_format == "awq_marlin": - self._maybe_prepare_offline_awq_marlin(x) - kwargs.update({ - "awq_marlin_qweight": self.awq_marlin_qweight, - "awq_marlin_scales": self.awq_marlin_scales, - "awq_marlin_zp": self.awq_marlin_zp, - "awq_marlin_workspace": self.awq_marlin_workspace, - "awq_weight_bits": 4, - }) - else: - raise RuntimeError( - f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - - base_out = strategy.linear_forward( - x, - None, # weight not used for offline quantized weights - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif self.has_quantized_weight(): - if strategy is None: - raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - # For int4 (W4A16), we need to pass original_in_features - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {"quant_scales": self.quant_scales} - if weight_format == "int4": - # For int4, packed weight shape is [out_features, (in_features + 1) // 2] - # We use x.shape[1] as the source of truth (it's the actual K dimension) - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward( - x, - self.quant_weight_int8, - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif strategy is None: - base_out = F.linear(x, self.weight, self.bias) - else: - # For int4 strategies (W4A16/W4A8), we need to pass original_in_features even when weight is not quantized yet - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {} - if weight_format == "int4": - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward(x, self.weight, self.bias, quant_kind=self.quant_kind, **kwargs) + base_out = self._forward_base(x, self.bias) return self.lora_forward(x, base_out) @@ -886,112 +935,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): self._maybe_quantize_loaded_weight_param(param, loaded_shard_id=None, expected_shard_ids={None}) def forward(self, x: torch.Tensor) -> torch.Tensor: - strategy = get_linear_strategy(self.quant_kind) - # Runtime safety net: ensure we don't keep bf16+quant weights both resident. 
- self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) - - # Check for offline quantized weights (GPTQ/AWQ) first - if self.has_offline_quantized_weight(): - if strategy is None: - raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) - out_features = int(self._offline_quant_out_features.item()) - in_features = int(self._offline_quant_in_features.item()) - group_size = int(self._offline_quant_group_size.item()) - weight_format = getattr(strategy, "linear_weight_format", None) - - kwargs = { - "out_features": out_features, - "in_features": in_features, - "group_size": group_size, - } - - if format_val == 1: # GPTQ - if weight_format == "gptq": - self._maybe_prepare_offline_gptq(x) - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - kwargs["gptq_g_idx"] = self.gptq_g_idx - elif weight_format == "gptq_marlin": - self._maybe_prepare_offline_gptq_marlin(x) - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 - if bits <= 0: - pack_factor = in_features // int(self.gptq_qweight.shape[0]) - bits = 32 // pack_factor - kwargs["gptq_weight_bits"] = bits - kwargs.update({ - "gptq_marlin_qweight": self.gptq_marlin_qweight, - "gptq_marlin_scales": self.gptq_marlin_scales, - "gptq_marlin_zp": self.gptq_marlin_zp, - "gptq_marlin_g_idx": self.gptq_marlin_g_idx, - "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, - "gptq_marlin_workspace": self.gptq_marlin_workspace, - }) - else: - raise RuntimeError( - f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - elif format_val == 2: # AWQ - if weight_format == "awq": - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) - elif weight_format == "awq_marlin": - self._maybe_prepare_offline_awq_marlin(x) - kwargs.update({ - "awq_marlin_qweight": self.awq_marlin_qweight, - "awq_marlin_scales": self.awq_marlin_scales, - "awq_marlin_zp": self.awq_marlin_zp, - "awq_marlin_workspace": self.awq_marlin_workspace, - "awq_weight_bits": 4, - }) - else: - raise RuntimeError( - f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." 
- ) - - base_out = strategy.linear_forward( - x, - None, # weight not used for offline quantized weights - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif self.has_quantized_weight(): - if strategy is None: - raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - # For int4 (W4A16), we need to pass original_in_features - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {"quant_scales": self.quant_scales} - if weight_format == "int4": - # For int4, packed weight shape is [out_features, (in_features + 1) // 2] - # We use x.shape[1] as the source of truth (it's the actual K dimension) - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward( - x, - self.quant_weight_int8, - self.bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif strategy is None: - base_out = F.linear(x, self.weight, self.bias) - else: - # For int4 strategies (W4A16/W4A8), we need to pass original_in_features even when weight is not quantized yet - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {} - if weight_format == "int4": - kwargs["original_in_features"] = x.shape[1] - base_out = strategy.linear_forward(x, self.weight, self.bias, quant_kind=self.quant_kind, **kwargs) + base_out = self._forward_base(x, self.bias) return self.lora_forward(x, base_out) @@ -1107,113 +1051,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if self.tp_rank == 0 else None - strategy = get_linear_strategy(self.quant_kind) - # Runtime safety net: ensure we don't keep bf16+quant weights both resident. - self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) - - # Check for offline quantized weights (GPTQ/AWQ) first - if self.has_offline_quantized_weight(): - if strategy is None: - raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) - out_features = int(self._offline_quant_out_features.item()) - in_features = int(self._offline_quant_in_features.item()) - group_size = int(self._offline_quant_group_size.item()) - weight_format = getattr(strategy, "linear_weight_format", None) - - kwargs = { - "out_features": out_features, - "in_features": in_features, - "group_size": group_size, - } - - if format_val == 1: # GPTQ - if weight_format == "gptq": - # vLLM requires gptq_shuffle before first gptq_gemm. - self._maybe_prepare_offline_gptq(x) - kwargs.update({ - "gptq_qweight": self.gptq_qweight, - "gptq_qzeros": self.gptq_qzeros, - "gptq_scales": self.gptq_scales, - "gptq_group_size": group_size, - }) - # Always pass g_idx (can be empty); strategy will normalize dtype/device. 
- kwargs["gptq_g_idx"] = self.gptq_g_idx - elif weight_format == "gptq_marlin": - self._maybe_prepare_offline_gptq_marlin(x) - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 - if bits <= 0: - pack_factor = in_features // int(self.gptq_qweight.shape[0]) - bits = 32 // pack_factor - kwargs["gptq_weight_bits"] = bits - kwargs.update({ - "gptq_marlin_qweight": self.gptq_marlin_qweight, - "gptq_marlin_scales": self.gptq_marlin_scales, - "gptq_marlin_zp": self.gptq_marlin_zp, - "gptq_marlin_g_idx": self.gptq_marlin_g_idx, - "gptq_marlin_g_idx_sort_indices": self.gptq_marlin_g_idx_sort_indices, - "gptq_marlin_workspace": self.gptq_marlin_workspace, - }) - else: - raise RuntimeError( - f"Offline GPTQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - elif format_val == 2: # AWQ - if weight_format == "awq": - kwargs.update({ - "awq_qweight": self.awq_qweight, - "awq_qzeros": self.awq_qzeros, - "awq_scales": self.awq_scales, - "awq_group_size": group_size, - }) - elif weight_format == "awq_marlin": - self._maybe_prepare_offline_awq_marlin(x) - kwargs.update({ - "awq_marlin_qweight": self.awq_marlin_qweight, - "awq_marlin_scales": self.awq_marlin_scales, - "awq_marlin_zp": self.awq_marlin_zp, - "awq_marlin_workspace": self.awq_marlin_workspace, - "awq_weight_bits": 4, - }) - else: - raise RuntimeError( - f"Offline AWQ weights are present, but current strategy weight_format={weight_format!r} " - "is not compatible." - ) - - y = strategy.linear_forward( - x, - None, # weight not used for offline quantized weights - bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif self.has_quantized_weight(): - if strategy is None: - raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - # For int4 (W4A16), we must pass original_in_features to disambiguate packed K. - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {"quant_scales": self.quant_scales} - if weight_format == "int4": - # Use activation K as the source of truth (it's the actual K dimension). - kwargs["original_in_features"] = x.shape[1] - y = strategy.linear_forward( - x, - self.quant_weight_int8, - bias, - quant_kind=self.quant_kind, - **kwargs, - ) - elif strategy is None: - y = F.linear(x, self.weight, bias) - else: - # For int4 strategies (W4A16/W4A8), we need to pass original_in_features even when weight is not quantized yet - weight_format = getattr(strategy, "linear_weight_format", None) - kwargs = {} - if weight_format == "int4": - kwargs["original_in_features"] = x.shape[1] - y = strategy.linear_forward(x, self.weight, bias, quant_kind=self.quant_kind, **kwargs) + y = self._forward_base(x, bias) if self.tp_size > 1: dist.all_reduce(y) return self.lora_forward(x, y) diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index 622e7e2..73ffb92 100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -226,38 +226,78 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): if not (use_gptq or use_awq): return loaded_gptq, loaded_awq, skipped - # Collect all weight names from safetensors files - all_keys = [] all_files = list(glob(os.path.join(config.model, "*.safetensors"))) + + # Scan keys once and remember which file contains each key. + # This avoids the O(num_modules * num_files) "search every file for every module" pattern below. 
+ key_to_file: dict[str, str] = {} + module_keys: dict[str, dict[str, str]] = {} + offline_suffixes = (".qweight", ".qzeros", ".scales", ".g_idx") for file in all_files: with safe_open(file, "pt", "cpu") as f: - all_keys.extend(f.keys()) - - # Group keys by module prefix - module_keys: dict[str, dict[str, str]] = {} - for key in all_keys: - # Check for GPTQ/AWQ keys: {prefix}.qweight, {prefix}.qzeros, {prefix}.scales, {prefix}.g_idx (GPTQ only) - if key.endswith(".qweight"): - prefix = key[:-8] # Remove ".qweight" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["qweight"] = key - elif key.endswith(".qzeros"): - prefix = key[:-7] # Remove ".qzeros" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["qzeros"] = key - elif key.endswith(".scales"): - prefix = key[:-7] # Remove ".scales" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["scales"] = key - elif key.endswith(".g_idx"): - prefix = key[:-6] # Remove ".g_idx" - if prefix not in module_keys: - module_keys[prefix] = {} - module_keys[prefix]["g_idx"] = key + for key in f.keys(): + if not key.endswith(offline_suffixes): + continue + key_to_file[key] = file + # Group by module prefix: {prefix}.qweight, {prefix}.qzeros, {prefix}.scales, {prefix}.g_idx (GPTQ only) + if key.endswith(".qweight"): + prefix = key[:-8] + module_keys.setdefault(prefix, {})["qweight"] = key + elif key.endswith(".qzeros"): + prefix = key[:-7] + module_keys.setdefault(prefix, {})["qzeros"] = key + elif key.endswith(".scales"): + prefix = key[:-7] + module_keys.setdefault(prefix, {})["scales"] = key + else: # .g_idx + prefix = key[:-6] + module_keys.setdefault(prefix, {})["g_idx"] = key + + # Cache modules lookup to avoid rebuilding dict(model.named_modules()) repeatedly. + named_modules = dict(model.named_modules()) + offline_capable_modules: dict[str, nn.Module] = { + name: m for name, m in named_modules.items() if hasattr(m, "set_offline_quantized_weight") + } + def _find_offline_capable_module(module_name: str) -> nn.Module | None: + """Best-effort resolve module_name to a module with offline quant support.""" + m = offline_capable_modules.get(module_name) + if m is not None: + return m + + # Try a few naming fallbacks (keep behavior compatible with the previous implementation). + leaf = module_name.split(".")[-1] if module_name else module_name + for name, cand in offline_capable_modules.items(): + if ( + name == module_name + or name.endswith("." + module_name) + or module_name.endswith("." 
+ name) + or (name.split(".")[-1] == leaf) + ): + return cand + return None + + def _load_tensors_for_prefix(key_dict: dict[str, str], *, want_g_idx: bool) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]: + """Load qweight/qzeros/scales/(g_idx) from the minimal set of safetensors files.""" + qweight = qzeros = scales = g_idx = None + keys = [key_dict.get("qweight"), key_dict.get("qzeros"), key_dict.get("scales")] + if want_g_idx: + keys.append(key_dict.get("g_idx")) + files_needed = {key_to_file.get(k) for k in keys if k} + files_needed.discard(None) + + for file in files_needed: + with safe_open(file, "pt", "cpu") as f: + if qweight is None and (key_dict.get("qweight") in f.keys()): + qweight = f.get_tensor(key_dict["qweight"]) + if qzeros is None and (key_dict.get("qzeros") in f.keys()): + qzeros = f.get_tensor(key_dict["qzeros"]) + if scales is None and (key_dict.get("scales") in f.keys()): + scales = f.get_tensor(key_dict["scales"]) + if want_g_idx and g_idx is None and ("g_idx" in key_dict) and (key_dict["g_idx"] in f.keys()): + g_idx = f.get_tensor(key_dict["g_idx"]) + return qweight, qzeros, scales, g_idx + # Load GPTQ/AWQ weights for each module packed_modules_mapping = getattr(model, "packed_modules_mapping", {}) @@ -272,31 +312,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): module_name = prefix.replace(k, v) break - # Try to find the module try: - module = None - # Try exact match first - try: - module = dict(model.named_modules())[module_name] - if not hasattr(module, "set_offline_quantized_weight"): - module = None - except KeyError: - pass - - # Try partial match if exact match failed - if module is None: - for name, m in model.named_modules(): - # Handle different naming conventions - if ( - name == module_name - or name.endswith("." + module_name) - or module_name.endswith("." + name) - or (name.split(".")[-1] == module_name.split(".")[-1]) - ): - if hasattr(m, "set_offline_quantized_weight"): - module = m - break - + module = _find_offline_capable_module(module_name) if module is None: skipped += 1 continue @@ -316,27 +333,10 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): skipped += 1 continue - # Load tensors from safetensors files - qweight = None - qzeros = None - scales = None - g_idx = None - - for file in all_files: - with safe_open(file, "pt", "cpu") as f: - if key_dict["qweight"] in f.keys() and qweight is None: - qweight = f.get_tensor(key_dict["qweight"]) - if key_dict["qzeros"] in f.keys() and qzeros is None: - qzeros = f.get_tensor(key_dict["qzeros"]) - if key_dict["scales"] in f.keys() and scales is None: - scales = f.get_tensor(key_dict["scales"]) - if format == "gptq" and "g_idx" in key_dict and key_dict["g_idx"] in f.keys() and g_idx is None: - g_idx = f.get_tensor(key_dict["g_idx"]) - - # Early exit if all required tensors are loaded - if qweight is not None and qzeros is not None and scales is not None: - if format != "gptq" or g_idx is not None: - break + # Load tensors from the minimal set of safetensors files. 
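+            # g_idx is only requested for GPTQ checkpoints (want_g_idx=True); AWQ checkpoints have no
+            # g_idx, and GPTQ exports with desc_act=False may not contain a meaningful one either.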
+ qweight, qzeros, scales, g_idx = _load_tensors_for_prefix( + key_dict, want_g_idx=(format == "gptq") + ) if qweight is None or qzeros is None or scales is None: skipped += 1 @@ -352,8 +352,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) in_features = int(qweight.shape[0]) * 16 if ckpt_bits not in (4, 8): - print( - f"Warning: gptq_marlin requires bits=4/8, got bits={ckpt_bits} for {module_name}. Skipping." + logger.warning( + f"gptq_marlin requires bits=4/8, got bits={ckpt_bits} for {module_name}. Skipping." ) skipped += 1 continue @@ -365,17 +365,17 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # qzeros: [K/group, N/pack] (may be empty for some checkpoints) if getattr(qzeros, "numel", lambda: 1)() == 0: if ckpt_bits not in (2, 4, 8): - print( - f"Warning: qzeros is empty and cannot infer bits for {module_name}. " - f"Please ensure quantize_config.json contains bits (2/4/8). Skipping." + logger.warning( + f"qzeros is empty and cannot infer bits for {module_name}. " + "Please ensure quantize_config.json contains bits (2/4/8). Skipping." ) skipped += 1 continue pack_factor = 32 // int(ckpt_bits) else: if int(qzeros.shape[1]) <= 0 or out_features % int(qzeros.shape[1]) != 0: - print( - f"Warning: Cannot infer GPTQ pack_factor from qzeros for {module_name}: " + logger.warning( + f"Cannot infer GPTQ pack_factor from qzeros for {module_name}: " f"qzeros.shape={tuple(qzeros.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." ) skipped += 1 @@ -386,8 +386,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # awq: qweight: [K, N/pack], scales: [K/group, N] out_features = int(scales.shape[1]) if scales.ndim == 2 else int(qweight.shape[1]) if int(qweight.shape[1]) <= 0 or out_features % int(qweight.shape[1]) != 0: - print( - f"Warning: Cannot infer AWQ pack_factor from scales/qweight for {module_name}: " + logger.warning( + f"Cannot infer AWQ pack_factor from scales/qweight for {module_name}: " f"scales.shape={tuple(scales.shape)}, qweight.shape={tuple(qweight.shape)}. Skipping." ) skipped += 1 @@ -428,9 +428,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): ): group_size_norm = in_features if group_size == -1 else group_size if group_size_norm <= 0 or (in_features % group_size_norm) != 0: - print( - f"Warning: Invalid group_size={group_size} for {module_name} with in_features={in_features}. " - "Skipping." + logger.warning( + f"Invalid group_size={group_size} for {module_name} with in_features={in_features}. Skipping." ) skipped += 1 continue @@ -443,7 +442,7 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): device=qweight.device, ) except Exception as e: - print(f"Warning: Failed to create dummy qzeros for {module_name}: {e}. Skipping.") + logger.warning(f"Failed to create dummy qzeros for {module_name}: {e}. Skipping.") skipped += 1 continue @@ -455,9 +454,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): tp_dim = getattr(module, "tp_dim", None) if tp_size > 1: if tp_dim not in (0, 1): - print( - f"Warning: Unsupported tp_dim={tp_dim} for offline quantized weights. " - f"Skipping {module_name}." + logger.warning( + f"Unsupported tp_dim={tp_dim} for offline quantized weights. Skipping {module_name}." ) skipped += 1 continue @@ -465,8 +463,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Shard along output features (N) for column-parallel modules. 
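+            # With tp_dim == 0 every per-N tensor is sliced along N for this rank: qweight/scales on
+            # their N axis and qzeros on its packed N axis, which is why the partition width below
+            # must also be divisible by pack_factor.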
if tp_dim == 0: if out_features % tp_size != 0: - print( - f"Warning: out_features={out_features} not divisible by TP={tp_size} for {module_name}. " + logger.warning( + f"out_features={out_features} not divisible by TP={tp_size} for {module_name}. " "Skipping offline quant weights for this module." ) skipped += 1 @@ -475,8 +473,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): out_start = tp_rank * out_per out_end = out_start + out_per if out_per % pack_factor != 0: - print( - f"Warning: out_features_per_partition={out_per} not divisible by pack_factor={pack_factor} " + logger.warning( + f"out_features_per_partition={out_per} not divisible by pack_factor={pack_factor} " f"for {module_name}. Skipping." ) skipped += 1 @@ -490,7 +488,9 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Marlin qweight packs N by a factor (bits/2): N_packed = N * (bits/2) n_factor = int(ckpt_bits) // 2 if n_factor <= 0: - print(f"Warning: invalid gptq_marlin n_factor for bits={ckpt_bits} ({module_name}). Skipping.") + logger.warning( + f"invalid gptq_marlin n_factor for bits={ckpt_bits} ({module_name}). Skipping." + ) skipped += 1 continue qweight = qweight[:, (out_start * n_factor):(out_end * n_factor)] @@ -516,8 +516,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Shard along input features (K) for row-parallel modules. elif tp_dim == 1: if in_features % tp_size != 0: - print( - f"Warning: in_features={in_features} not divisible by TP={tp_size} for {module_name}. " + logger.warning( + f"in_features={in_features} not divisible by TP={tp_size} for {module_name}. " "Skipping offline quant weights for this module." ) skipped += 1 @@ -526,8 +526,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): in_start = tp_rank * in_per in_end = in_start + in_per if group_size <= 0 or (in_per % group_size) != 0 or (in_start % group_size) != 0: - print( - f"Warning: group_size={group_size} incompatible with TP sharding for {module_name} " + logger.warning( + f"group_size={group_size} incompatible with TP sharding for {module_name} " f"(in_per={in_per}, in_start={in_start}). Skipping." ) skipped += 1 @@ -539,8 +539,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): if is_gptq_marlin_ckpt: # Marlin qweight packs K in tiles of 16: K_packed = K / 16 if in_start % 16 != 0: - print( - f"Warning: gptq_marlin requires in_start divisible by 16, got in_start={in_start} " + logger.warning( + f"gptq_marlin requires in_start divisible by 16, got in_start={in_start} " f"for {module_name}. Skipping." ) skipped += 1 @@ -553,8 +553,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): group_size_norm = in_features if group_size == -1 else group_size expected_num_groups = in_features // group_size_norm if group_size_norm > 0 else 0 if expected_num_groups <= 0: - print( - f"Warning: invalid expected_num_groups={expected_num_groups} for {module_name}. Skipping." + logger.warning( + f"invalid expected_num_groups={expected_num_groups} for {module_name}. Skipping." ) skipped += 1 continue @@ -564,8 +564,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): # Legacy/alternate layout: [2*num_groups, N/2] scales = scales[(2 * g_start):(2 * g_end), :] else: - print( - f"Warning: unexpected gptq_marlin scales.shape[0]={int(scales.shape[0])} " + logger.warning( + f"unexpected gptq_marlin scales.shape[0]={int(scales.shape[0])} " f"(expected {expected_num_groups} or {2*expected_num_groups}) for {module_name}. Skipping." 
) skipped += 1 @@ -576,8 +576,8 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): else: # qweight: [K/pack, N] (packed on K) if in_start % pack_factor != 0: - print( - f"Warning: in_start={in_start} not divisible by pack_factor={pack_factor} " + logger.warning( + f"in_start={in_start} not divisible by pack_factor={pack_factor} " f"for {module_name}. Skipping." ) skipped += 1 @@ -632,15 +632,11 @@ def _load_gptq_awq_weights(model: nn.Module, config: Config): else: loaded_awq += 1 except Exception as e: - print(f"Failed to load offline quantized weights for {module_name}: {e}") - import traceback - traceback.print_exc() + logger.exception(f"Failed to load offline quantized weights for {module_name}: {e}") skipped += 1 except Exception as e: - print(f"Error loading offline quantized weights for {prefix}: {e}") - import traceback - traceback.print_exc() + logger.exception(f"Error loading offline quantized weights for {prefix}: {e}") skipped += 1 return loaded_gptq, loaded_awq, skipped diff --git a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py index e1b085e..870a860 100644 --- a/diffulex/utils/quantization/strategies/linear_int4_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_int4_w4a16.py @@ -89,6 +89,24 @@ def quantize(self, tensor: torch.Tensor, **kwargs: Any) -> tuple[torch.Tensor, A packed = self._pack_int4_to_int8(q) return packed, {"scales": scales} + def quantize_weight_for_kernel( + self, + weight: torch.Tensor, + *, + device: torch.device | None = None, + **_: Any, + ) -> tuple[torch.Tensor, Any]: + """Quantize+pack bf16 weight for kernel consumption. + + Returns: + (packed_int8 [N, ceil(K/2)], scales_fp32 [N]) + """ + packed, meta = self.quantize(weight) + if device is not None: + packed = packed.to(device=device) + meta["scales"] = meta["scales"].to(device=device) + return packed, meta["scales"] + def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs: Any) -> torch.Tensor: original_k = int(kwargs.get("original_in_features", 0)) if original_k <= 0: diff --git a/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml deleted file mode 100644 index bae9875..0000000 --- a/diffulex_bench/configs/gptq_marlin_w2_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W2, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 2048 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W2) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_w2_bf16kv" - save_results: true - use_tqdm: true From 23d377a9624b8600ac3a8486a46b2f7c6e9c8b77 Mon Sep 17 00:00:00 
2001 From: luozixin2 Date: Sat, 24 Jan 2026 09:17:01 +0000 Subject: [PATCH 05/10] =?UTF-8?q?fix:=20=E4=BF=AE=E6=AD=A3=20bench=20?= =?UTF-8?q?=E4=B8=AD=20prefill/decode=20=E5=90=9E=E5=90=90=E9=87=8F?= =?UTF-8?q?=E7=9A=84=E5=B9=B3=E5=9D=87=E5=80=BC=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 将最后总结从最后一步的瞬时吞吐改为真正的平均值(总token/总时间) - 新增 ms/step 统计信息,便于分析性能 - 修复了之前只显示最后一步瞬时值而非平均值的问题 --- diffulex/engine/tp_worker.py | 63 ++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/diffulex/engine/tp_worker.py b/diffulex/engine/tp_worker.py index 0f46edf..6b6df33 100755 --- a/diffulex/engine/tp_worker.py +++ b/diffulex/engine/tp_worker.py @@ -102,21 +102,40 @@ def generate( sid = self.add_request(prompt, sp) seqid_to_idx[sid] = idx outputs = [None] * len(prompts) - prefill_throughput = decode_throughput = 0. + # Track per-step instantaneous throughput for display, and + # token/time totals for correct average throughput reporting. + last_prefill_throughput = 0.0 + last_decode_throughput = 0.0 + prefill_total_tokens = 0 + decode_total_tokens = 0 + prefill_total_time = 0.0 + decode_total_time = 0.0 + prefill_steps = 0 + decode_steps = 0 n_steps = 0 n_diff_steps = [-1] * len(prompts) while not self.is_finished(): - t = perf_counter() n_steps += 1 + t = perf_counter() output, num_tokens, is_prefill, cur_n_diff_steps, _ = self.step() + dt = perf_counter() - t + + # Accumulate totals to compute average throughput correctly. + if is_prefill: + prefill_steps += 1 + prefill_total_tokens += int(num_tokens) + prefill_total_time += float(dt) + last_prefill_throughput = (num_tokens / dt) if dt > 0 else 0.0 + else: + decode_steps += 1 + decode_total_tokens += int(num_tokens) + decode_total_time += float(dt) + last_decode_throughput = (num_tokens / dt) if dt > 0 else 0.0 + if use_tqdm: - if is_prefill: - prefill_throughput = num_tokens / (perf_counter() - t) - else: - decode_throughput = num_tokens / (perf_counter() - t) pbar.set_postfix({ - "Prefill": f"{int(prefill_throughput)}tok/s", - "Decode": f"{int(decode_throughput)}tok/s", + "Prefill": f"{int(last_prefill_throughput)}tok/s", + "Decode": f"{int(last_decode_throughput)}tok/s", }) if cur_n_diff_steps: for seq_id, n_step in cur_n_diff_steps.items(): @@ -128,9 +147,33 @@ def generate( if use_tqdm: pbar.update(1) + avg_prefill_throughput = ( + prefill_total_tokens / prefill_total_time if prefill_total_time > 0 else 0.0 + ) + avg_decode_throughput = ( + decode_total_tokens / decode_total_time if decode_total_time > 0 else 0.0 + ) + avg_prefill_step_ms = ( + (prefill_total_time / prefill_steps) * 1000.0 if prefill_steps > 0 else 0.0 + ) + avg_decode_step_ms = ( + (decode_total_time / decode_steps) * 1000.0 if decode_steps > 0 else 0.0 + ) logger.info( - f"Finished in {n_steps} steps, prefill throughput: {prefill_throughput:.2f} tok/s, " - f"decode throughput: {decode_throughput:.2f} tok/s" + "Finished in %d steps (prefill=%d, decode=%d). " + "Prefill: %d tok in %.2fs (avg %.2f tok/s, %.2f ms/step). 
" + "Decode: %d tok in %.2fs (avg %.2f tok/s, %.2f ms/step).", + n_steps, + prefill_steps, + decode_steps, + prefill_total_tokens, + prefill_total_time, + avg_prefill_throughput, + avg_prefill_step_ms, + decode_total_tokens, + decode_total_time, + avg_decode_throughput, + avg_decode_step_ms, ) # Ensure all outputs are present assert all(toks is not None for toks in outputs), "Some sequences did not produce outputs" From 896b8dfe9f065208c305176fba06cbd32eba1c6f Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 25 Jan 2026 07:19:47 +0000 Subject: [PATCH 06/10] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96=E9=87=8F?= =?UTF-8?q?=E5=8C=96=20linear=20fast=20path=20=E5=B9=B6=E7=A7=BB=E9=99=A4?= =?UTF-8?q?=20profiler=20=E6=A0=87=E6=B3=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 量化 linear:去 kwargs/pop/重复可用性检查,缓存 out_features 与必要中间张量 - 直连 vLLM CUDA ops(W8A8/GPTQ/AWQ/Marlin 等)以降低 Python glue 开销 - load-time 处理 qweight/scales 的布局与 contiguous,避免 forward 里重复处理 - 移除 linear.py 中 profiler record 标注,保持代码简洁 - 补充 trace/profile 辅助分析脚本与相关测试 --- diffulex/engine/tp_worker.py | 17 +- diffulex/layer/linear.py | 206 +++- diffulex/sampler/dream.py | 8 +- diffulex/sampler/fast_dllm_v2.py | 16 +- diffulex/sampler/llada.py | 7 +- diffulex/sampler/sdar.py | 16 +- diffulex/strategy/d2f/engine/scheduler.py | 11 +- diffulex/strategy/d2f/engine/sequence.py | 4 +- .../strategies/linear_awq_marlin_w4a16.py | 109 +- .../strategies/linear_awq_w4a16.py | 54 +- .../strategies/linear_gptq_marlin_w4a16.py | 142 ++- .../strategies/linear_gptq_w4a16.py | 101 +- .../strategies/linear_int8_w8a8.py | 64 +- .../strategies/linear_marlin_int8_w8a16.py | 209 ++-- .../python/dllm_flash_attn_kernels.py | 1000 +++-------------- .../dllm_flash_attn_prefill_tilelang.py | 250 +++++ .../python/paged_attn_decode_triton.py | 661 +++++++++++ profile/analyze_trace_bottlenecks.py | 298 +++++ profile/analyze_trace_cpu_ops.py | 149 +++ profile/analyze_trace_gemm_shapes.py | 309 +++++ .../kernel/test_paged_attn_decode_triton.py | 240 ++++ 21 files changed, 2724 insertions(+), 1147 deletions(-) create mode 100644 diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py create mode 100644 diffulex_kernel/python/paged_attn_decode_triton.py create mode 100644 profile/analyze_trace_bottlenecks.py create mode 100644 profile/analyze_trace_cpu_ops.py create mode 100644 profile/analyze_trace_gemm_shapes.py create mode 100644 test/python/kernel/test_paged_attn_decode_triton.py diff --git a/diffulex/engine/tp_worker.py b/diffulex/engine/tp_worker.py index 6b6df33..ba65d67 100755 --- a/diffulex/engine/tp_worker.py +++ b/diffulex/engine/tp_worker.py @@ -102,10 +102,7 @@ def generate( sid = self.add_request(prompt, sp) seqid_to_idx[sid] = idx outputs = [None] * len(prompts) - # Track per-step instantaneous throughput for display, and - # token/time totals for correct average throughput reporting. - last_prefill_throughput = 0.0 - last_decode_throughput = 0.0 + # Track token/time totals for correct average throughput reporting. 
prefill_total_tokens = 0 decode_total_tokens = 0 prefill_total_time = 0.0 @@ -125,17 +122,21 @@ def generate( prefill_steps += 1 prefill_total_tokens += int(num_tokens) prefill_total_time += float(dt) - last_prefill_throughput = (num_tokens / dt) if dt > 0 else 0.0 else: decode_steps += 1 decode_total_tokens += int(num_tokens) decode_total_time += float(dt) - last_decode_throughput = (num_tokens / dt) if dt > 0 else 0.0 if use_tqdm: + avg_prefill_throughput = ( + prefill_total_tokens / prefill_total_time if prefill_total_time > 0 else 0.0 + ) + avg_decode_throughput = ( + decode_total_tokens / decode_total_time if decode_total_time > 0 else 0.0 + ) pbar.set_postfix({ - "Prefill": f"{int(last_prefill_throughput)}tok/s", - "Decode": f"{int(last_decode_throughput)}tok/s", + "Prefill(avg)": f"{int(avg_prefill_throughput)}tok/s", + "Decode(avg)": f"{int(avg_decode_throughput)}tok/s", }) if cur_n_diff_steps: for seq_id, n_step in cur_n_diff_steps.items(): diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index f26566d..e3581e9 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -78,6 +78,9 @@ def __init__( super().__init__() self.input_size = input_size self.output_size = output_size + # Cache forward output features (avoid per-call inference). + # Subclasses with TP partitions should overwrite this after partition sizes are known. + self._forward_out_features: int = int(output_size) self.tp_dim = tp_dim self.quant_kind = (quant_kind or "other").strip().lower() or "other" self.tp_rank = dist.get_rank() @@ -86,6 +89,8 @@ def __init__( # NOTE: We keep these as buffers so they move with the module and do not appear as Parameters. self.register_buffer("quant_weight_int8", torch.empty(0, dtype=torch.int8), persistent=False) self.register_buffer("quant_scales", torch.empty(0, dtype=torch.bfloat16), persistent=False) + # Cache a 1xN view of scales to avoid per-call view/shape handling on hot paths. + self.register_buffer("quant_scales_1xn", torch.empty(0, dtype=torch.bfloat16), persistent=False) self.register_buffer("_weight_is_quantized", torch.tensor(False, dtype=torch.bool), persistent=False) # GPTQ/AWQ offline quantized weight storage (W4A16). @@ -243,6 +248,13 @@ def _infer_module_device() -> torch.device: if g_idx is not None and g_idx.device != module_device: g_idx = g_idx.to(device=module_device) + # Make packed tensors contiguous once at load-time (avoid per-call checks/copies). + qweight = qweight.contiguous() + qzeros = qzeros.contiguous() + scales = scales.contiguous() + if g_idx is not None: + g_idx = g_idx.contiguous() + # group_size == -1 means channelwise in some ecosystems; vLLM normalizes -1 to K. group_size_norm = in_features if group_size == -1 else group_size if group_size_norm <= 0 or (in_features % group_size_norm != 0): @@ -458,8 +470,8 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: # g_idx (act-order) handling: marlin expects sorted g_idx + sort indices; otherwise empty. 
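+        # A non-empty g_idx means the checkpoint used act-order (desc_act=True): input channels (K)
+        # were quantized in an activation-sorted order, so Marlin needs the sorted g_idx plus the
+        # sort indices to apply the matching permutation inside the GEMM.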
if self.gptq_g_idx.numel() > 0: g_idx_sorted, g_idx_sort_indices = marlin_sort_g_idx(self.gptq_g_idx.to(device=device, dtype=torch.int32)) - self.gptq_marlin_g_idx = g_idx_sorted - self.gptq_marlin_g_idx_sort_indices = g_idx_sort_indices + self.gptq_marlin_g_idx = g_idx_sorted.contiguous() + self.gptq_marlin_g_idx_sort_indices = g_idx_sort_indices.contiguous() else: self.gptq_marlin_g_idx = marlin_make_empty_g_idx(device) self.gptq_marlin_g_idx_sort_indices = marlin_make_empty_g_idx(device) @@ -476,7 +488,7 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: size_n=out_features, num_bits=weight_bits, is_a_8bit=False, - ) + ).contiguous() # Permute scales to marlin format. self.gptq_marlin_scales = marlin_permute_scales( @@ -485,7 +497,7 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: size_n=out_features, group_size=group_size, is_a_8bit=False, - ) + ).contiguous() # GPTQ Marlin only supports symmetric weights (no runtime zero-points). # Use empty zp to keep has_zp=False in the kernel. @@ -542,30 +554,30 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: # Repack qweight to marlin format. self.awq_marlin_qweight = ops.awq_marlin_repack( - self.awq_qweight, + self.awq_qweight.contiguous(), size_k=in_features, size_n=out_features, num_bits=weight_bits, is_a_8bit=False, - ) + ).contiguous() # Permute scales to marlin format. self.awq_marlin_scales = marlin_permute_scales( - self.awq_scales, + self.awq_scales.contiguous(), size_k=in_features, size_n=out_features, group_size=group_size, is_a_8bit=False, - ) + ).contiguous() # Convert zero-points to marlin format. self.awq_marlin_zp = awq_to_marlin_zero_points( - self.awq_qzeros, + self.awq_qzeros.contiguous(), size_k=num_groups, size_n=out_features, num_bits=weight_bits, is_a_8bit=False, - ) + ).contiguous() self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) @@ -598,19 +610,39 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to except Exception: strategy = None scale_dtype = torch.bfloat16 + force_weight_contig = True if strategy is not None: weight_format = getattr(strategy, "linear_weight_format", None) act_format = getattr(strategy, "linear_act_format", None) # FP8 W8A16 uses float32 scales if weight_format in ("fp8_e4m3", "fp8_e5m2") and act_format == "bf16": scale_dtype = torch.float32 - # FP8 W8A8 and int8 W8A8 use float16 scales - elif act_format in ("int8", "fp8_e4m3", "fp8_e5m2"): + # W8A8 int8 uses float32 [1, N] weight scales in vLLM cutlass_scaled_mm path. + elif weight_format == "int8" and act_format == "int8": + scale_dtype = torch.float32 + # vLLM CUTLASS scaled_mm expects int8 weight in KxN with stride(0)==1, + # which is typically produced as a transpose-view (non-contiguous). + # Do NOT force contiguous here; just avoid per-call conversions. + force_weight_contig = False + # FP8 W8A8 keeps float32 scales; also keep KxN transpose-view layout. + elif act_format in ("fp8_e4m3", "fp8_e5m2"): + scale_dtype = torch.float32 + force_weight_contig = False + # Other int8/int4 mixed paths use float16 scales by default. + elif act_format == "int8": scale_dtype = torch.float16 if quant_scales.dtype != scale_dtype: quant_scales = quant_scales.to(dtype=scale_dtype) + # Make sure scales are contiguous once at load-time. + # NOTE: Some kernels require specific non-contiguous weight layouts (e.g., W8A8 KxN with stride(0)==1). 
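+        # (That layout is what cutlass_scaled_mm consumes: a column-major [K, N] view, usually a
+        # transpose of the row-major [N, K] int8 weight, so calling .contiguous() on it would
+        # silently break the expected strides.)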
+ # We avoid per-call `is_contiguous/contiguous` checks while preserving required layouts. + if force_weight_contig: + quant_weight_int8 = quant_weight_int8.contiguous() + quant_scales = quant_scales.contiguous() self.quant_weight_int8 = quant_weight_int8 self.quant_scales = quant_scales + # 1xN view for fused kernels expecting 2D scales. + self.quant_scales_1xn = quant_scales if quant_scales.dim() == 2 else quant_scales.view(1, -1) self._weight_is_quantized.fill_(True) def _maybe_promote_weight_to_quantized_at_runtime( @@ -738,13 +770,13 @@ def _infer_gptq_weight_bits(self, *, in_features: int) -> int: raise RuntimeError(f"GPTQ bits 推断失败:pack_factor={pack_factor} 不满足 32%pack_factor==0") return 32 // pack_factor - def _maybe_int4_original_in_features_kwargs(self, strategy, x: torch.Tensor) -> dict: + def _maybe_int4_original_in_features_kwargs(self, strategy, x: torch.Tensor) -> Optional[dict]: """Some int4 kernels need original K (before packing).""" if strategy is None: - return {} + return None if getattr(strategy, "linear_weight_format", None) == "int4": return {"original_in_features": x.shape[1]} - return {} + return None def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: """Build kwargs for offline GPTQ/AWQ (including Marlin variants).""" @@ -830,10 +862,90 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if self.has_offline_quantized_weight(): if strategy is None: raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + weight_format = getattr(strategy, "linear_weight_format", None) + out_features, in_features, group_size = self._offline_meta() + + # Avoid per-call kwargs dict construction on hot paths. + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + return strategy.linear_forward( + x, + None, # weight not used for offline quantized weights + bias, + quant_kind=self.quant_kind, + gptq_qweight=self.gptq_qweight, + gptq_qzeros=self.gptq_qzeros, + gptq_scales=self.gptq_scales, + gptq_g_idx=self.gptq_g_idx, + weight_bits=bits, + use_v2_format=False, + out_features=out_features, + in_features=in_features, + group_size=group_size, + ) + + if weight_format == "awq": + # AWQ is 4-bit only in vLLM; bits stored in _offline_quant_bits. 
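+                # pack_factor (32 // bits) is forwarded for signature compatibility; the AWQ strategy
+                # itself derives the output width from out_features / scales rather than from it.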
+ bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 4 + pack_factor = 32 // max(1, bits) + return strategy.linear_forward( + x, + None, + bias, + quant_kind=self.quant_kind, + awq_qweight=self.awq_qweight, + awq_qzeros=self.awq_qzeros, + awq_scales=self.awq_scales, + pack_factor=pack_factor, + out_features=out_features, + in_features=in_features, + group_size=group_size, + ) + + if weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + return strategy.linear_forward( + x, + None, + bias, + quant_kind=self.quant_kind, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=self.gptq_marlin_g_idx, + g_idx_sort_indices=self.gptq_marlin_g_idx_sort_indices, + workspace=self.gptq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + weight_bits=bits, + tp_dim=self.tp_dim, + ) + + if weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(x) + return strategy.linear_forward( + x, + None, + bias, + quant_kind=self.quant_kind, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + zp=self.awq_marlin_zp, + workspace=self.awq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + tp_dim=self.tp_dim, + ) + + # Fallback: compatibility for any remaining strategies. kwargs = self._build_offline_forward_kwargs(x, strategy) return strategy.linear_forward( x, - None, # weight not used for offline quantized weights + None, bias, quant_kind=self.quant_kind, **kwargs, @@ -842,14 +954,65 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if self.has_quantized_weight(): if strategy is None: raise RuntimeError("Quantized weight is present but no linear strategy is configured.") - kwargs = {"quant_scales": self.quant_scales} - kwargs.update(self._maybe_int4_original_in_features_kwargs(strategy, x)) + # Hot path: avoid per-call dict construction when possible. + extra_kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) + # W8A16(AllSpark) expects scales in 1xN layout and needs explicit N. + if getattr(strategy, "name", "") == "linear_int8_w8a16": + if extra_kwargs: + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + **extra_kwargs, + ) + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + ) + + # W8A8 expects scales in 1xN layout and is sensitive to weight layout (KxN stride0==1). 
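+            # quant_scales_1xn is the cached [1, N] view of quant_scales prepared in
+            # set_quantized_weight, so the hot path never reshapes scales per call.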
+ if getattr(strategy, "name", "") == "linear_int8_w8a8": + if extra_kwargs: + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + **extra_kwargs, + ) + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales_1xn, + out_features=self._forward_out_features, + ) + + if extra_kwargs: + return strategy.linear_forward( + x, + self.quant_weight_int8, + bias, + quant_kind=self.quant_kind, + quant_scales=self.quant_scales, + **extra_kwargs, + ) return strategy.linear_forward( x, self.quant_weight_int8, bias, quant_kind=self.quant_kind, - **kwargs, + quant_scales=self.quant_scales, ) if strategy is None: @@ -862,7 +1025,9 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if weight is None: raise RuntimeError("Strategy is configured but weight is missing (expected bf16 weight).") kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) - return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) + if kwargs: + return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) + return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind) def forward(self, x: torch.Tensor) -> torch.Tensor: raise NotImplementedError @@ -915,6 +1080,7 @@ def __init__( LinearBase.__init__(self, input_size, output_size, 0, quant_kind) self.input_size_per_partition = input_size self.output_size_per_partition = divide(output_size, self.tp_size) + self._forward_out_features = int(self.output_size_per_partition) self.weight = nn.Parameter(torch.empty(self.output_size_per_partition, self.input_size)) self.weight.weight_loader = self.weight_loader diff --git a/diffulex/sampler/dream.py b/diffulex/sampler/dream.py index 9f06340..1ff85c6 100644 --- a/diffulex/sampler/dream.py +++ b/diffulex/sampler/dream.py @@ -56,9 +56,11 @@ def forward(self, logits: torch.Tensor, temperatures: torch.Tensor, high_conf_indices = torch.where(initial_confidence > block.accept_threshold)[0] accepted_ids = high_conf_indices - true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist()] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + # Avoid calling `.tolist()` on CUDA tensors directly (can trigger many per-element DtoH syncs). 
+ accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/sampler/fast_dllm_v2.py b/diffulex/sampler/fast_dllm_v2.py index ec323b5..5726655 100644 --- a/diffulex/sampler/fast_dllm_v2.py +++ b/diffulex/sampler/fast_dllm_v2.py @@ -59,19 +59,15 @@ def forward(self, seqs: list[SequenceBase], logits: torch.Tensor, temperatures: if len(high_conf_indices) == 0: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) + accepted_ids = max_prob_idx.view(1) else: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.unique(torch.cat([ - high_conf_indices, - torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) - ])) + accepted_ids = torch.unique(torch.cat([high_conf_indices, max_prob_idx.view(1)])) - true_local_ids_sub_map[str(block_id)] = [ - block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist() - ] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/sampler/llada.py b/diffulex/sampler/llada.py index 5202fa1..fd11f44 100644 --- a/diffulex/sampler/llada.py +++ b/diffulex/sampler/llada.py @@ -52,9 +52,10 @@ def forward(self, logits: torch.Tensor, temperatures: torch.Tensor, high_conf_indices = torch.where(initial_confidence > block.accept_threshold)[0] accepted_ids = high_conf_indices - true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist()] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/sampler/sdar.py b/diffulex/sampler/sdar.py index 4eeb471..8fc3896 100644 --- a/diffulex/sampler/sdar.py +++ b/diffulex/sampler/sdar.py @@ -59,19 +59,15 @@ def forward(self, seqs: list[SequenceBase], logits: torch.Tensor, temperatures: if len(high_conf_indices) == 0: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) + accepted_ids = max_prob_idx.view(1) else: max_prob_idx = initial_confidence.argmax() - accepted_ids = torch.unique(torch.cat([ - high_conf_indices, - torch.tensor([max_prob_idx], device=sampled_tokens.device, dtype=torch.long) - ])) + accepted_ids = torch.unique(torch.cat([high_conf_indices, max_prob_idx.view(1)])) - 
true_local_ids_sub_map[str(block_id)] = [ - block.local_mask_token_ids[accepted_id] for accepted_id in accepted_ids.tolist() - ] - accepted_ids_sub_map[str(block_id)] = accepted_ids.tolist() - sampled_tokens_sub_map[str(block_id)] = sampled_tokens + accepted_ids_list = accepted_ids.to(device="cpu").tolist() + true_local_ids_sub_map[str(block_id)] = [block.local_mask_token_ids[i] for i in accepted_ids_list] + accepted_ids_sub_map[str(block_id)] = accepted_ids_list + sampled_tokens_sub_map[str(block_id)] = sampled_tokens.to(device="cpu").tolist() seq_idx = str(seq.seq_id) true_local_ids_map[seq_idx] = true_local_ids_sub_map diff --git a/diffulex/strategy/d2f/engine/scheduler.py b/diffulex/strategy/d2f/engine/scheduler.py index a4b8f29..d362dda 100644 --- a/diffulex/strategy/d2f/engine/scheduler.py +++ b/diffulex/strategy/d2f/engine/scheduler.py @@ -5,6 +5,8 @@ from diffulex.engine.sequence import SequenceStatus from .sequence import D2FSequence +import torch + @AutoScheduler.register("d2f", is_default=True) class D2FScheduler(SchedulerBase): @@ -104,12 +106,17 @@ def postprocess( continue diffusion_block = seq.diffusion_blocks[int(block_id)] sampled_tokens = sampled_tokens_map.get(block_id, []) + # `sampled_tokens` may be a CUDA Tensor (legacy behavior) or list[int]. + # Converting per-token via `.item()` causes massive DtoH sync overhead. + # Convert once per block. + if isinstance(sampled_tokens, torch.Tensor): + sampled_tokens = sampled_tokens.tolist() true_local_ids = true_ids_map.get(block_id, []) for true_local_id, accepted_id in zip(true_local_ids, accepted_ids): - token = sampled_tokens[accepted_id] + token = int(sampled_tokens[accepted_id]) diffusion_block.modify_token(true_local_id, token) if ( - (not seq.ignore_eos and token.item() == self.eos) + (not seq.ignore_eos and token == self.eos) or seq.num_completion_tokens >= seq.max_tokens ): seq.meet_eos = True diff --git a/diffulex/strategy/d2f/engine/sequence.py b/diffulex/strategy/d2f/engine/sequence.py index db22bc8..7532ea8 100644 --- a/diffulex/strategy/d2f/engine/sequence.py +++ b/diffulex/strategy/d2f/engine/sequence.py @@ -117,7 +117,9 @@ def modify_token(self, local_token_id: int, modified_to: int) -> None: raise RuntimeError("Diffusion block is not attached to a sequence.") target_id = local_token_id + self.global_start_id assert self.seq.token_ids[target_id] == self.mask_token_id - self.seq.token_ids[target_id] = modified_to.item() # type: ignore[assignment] + # Hot path: avoid per-token CUDA -> CPU sync via Tensor.item(). + # `modified_to` should be a python int (or at least int-castable). 
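+        # int() still accepts a 0-d tensor as a fallback, but that re-introduces a device sync if the
+        # tensor lives on the GPU; the scheduler now converts sampled tokens to python ints once per block.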
+ self.seq.token_ids[target_id] = int(modified_to) # type: ignore[assignment] self.seq.new_tokens += 1 diff --git a/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py index be9389f..bb19518 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_marlin_w4a16.py @@ -21,12 +21,14 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore apply_awq_marlin_linear, marlin_make_empty_g_idx, + should_use_atomic_add_reduce, marlin_permute_bias, ) from vllm.scalar_type import scalar_types # type: ignore except Exception: # pragma: no cover apply_awq_marlin_linear = None # type: ignore marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore marlin_permute_bias = None # type: ignore scalar_types = None # type: ignore @@ -37,6 +39,13 @@ def _build_linear_awq_marlin_w4a16() -> LinearQuantizationStrategy: class LinearAWQMarlinW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._available: bool = bool(apply_awq_marlin_linear is not None and scalar_types is not None) + self._empty_cache: dict[int, torch.Tensor] = {} + self._bias_cache: dict[tuple[int, int], torch.Tensor] = {} + self._atomic_add_cache: dict[tuple[int, int, int, int, int], bool] = {} + @property def name(self) -> str: return "linear_awq_marlin_w4a16" @@ -75,49 +84,83 @@ def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + workspace: Optional[torch.Tensor] = None, + in_features: int = 0, + out_features: int = 0, + group_size: int = 128, + tp_dim: Optional[int] = None, ) -> torch.Tensor: - _ = quant_kind, weight - if apply_awq_marlin_linear is None or scalar_types is None: + _ = quant_kind, weight, group_size, tp_dim + if not self._available or workspace is None: raise RuntimeError("awq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") + if in_features <= 0 or out_features <= 0: + raise RuntimeError("awq_marlin: missing in_features/out_features.") - qweight = kwargs.get("awq_marlin_qweight", None) - scales = kwargs.get("awq_marlin_scales", None) - zp = kwargs.get("awq_marlin_zp", None) - workspace = kwargs.get("awq_marlin_workspace", None) - in_features = int(kwargs.get("in_features", 0)) - out_features = int(kwargs.get("out_features", 0)) - - if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: - raise RuntimeError("awq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") - - # vLLM marlin kernels expect FP16 activations. - x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x + device = x.device + dev_key = int(device.index) if device.type == "cuda" and device.index is not None else -1 - # AWQ marlin does not use g_idx. - empty = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + # AWQ marlin does not use g_idx/perm; pass empty tensors (cached). 
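+        # The empty g_idx tensor is cached per CUDA device index (dev_key) so repeated forwards on the
+        # same device reuse one allocation instead of calling marlin_make_empty_g_idx every step.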
+ empty = self._empty_cache.get(dev_key) + if empty is None: + empty = marlin_make_empty_g_idx(device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=device, dtype=torch.int32) + self._empty_cache[dev_key] = empty + # Cache permuted bias. marlin_bias = None if bias is not None: - marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias - - out = apply_awq_marlin_linear( - input=x_in, - weight=qweight, - weight_scale=scales, - weight_zp=zp, - g_idx=empty, - g_idx_sort_indices=empty, - workspace=workspace, - quant_type=scalar_types.uint4, - output_size_per_partition=out_features, - input_size_per_partition=in_features, - bias=marlin_bias, - input_dtype=None, + bkey = (dev_key, int(bias.data_ptr())) + marlin_bias = self._bias_cache.get(bkey) + if marlin_bias is None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + self._bias_cache[bkey] = marlin_bias + + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (int(out_features),) + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + dtype_id = 1 if reshaped_x.dtype == torch.bfloat16 else (2 if reshaped_x.dtype == torch.float16 else 0) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + akey = (dev_key, dtype_id, m, n, k) + cached = self._atomic_add_cache.get(akey) + if cached is None: + cached = bool( + should_use_atomic_add_reduce( + m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype + ) + ) + self._atomic_add_cache[akey] = cached + use_atomic_add = cached + + out = torch.ops._C.gptq_marlin_gemm( + reshaped_x, + None, + qweight, + marlin_bias, + scales, + None, + None, + zp, + empty, + empty, + workspace, + scalar_types.uint4.id, + m, + n, + k, + True, # is_k_full + use_atomic_add, + True, # use_fp32_reduce + False, # is_zp_float ) + out = out.reshape(out_shape) return out.to(dtype=x.dtype) if out.dtype != x.dtype else out diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 488176e..22295fa 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -30,6 +30,10 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm")) + @property def name(self) -> str: return "linear_awq_w4a16" @@ -73,47 +77,47 @@ def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + awq_qweight: Optional[torch.Tensor] = None, + awq_qzeros: Optional[torch.Tensor] = None, + awq_scales: Optional[torch.Tensor] = None, + pack_factor: int = 8, + out_features: Optional[int] = None, + in_features: Optional[int] = None, + group_size: int = 128, ) -> torch.Tensor: - _ = quant_kind, weight - if ops is None: + _ = quant_kind, weight, pack_factor, in_features, group_size + if not self._ops_available: raise RuntimeError( "vLLM is required for AWQ W4A16 (missing `vllm._custom_ops`). " "Please install/build vLLM with CUDA ops." 
) - - qweight = kwargs.get("awq_qweight", None) - qzeros = kwargs.get("awq_qzeros", None) - scales = kwargs.get("awq_scales", None) - + qweight = awq_qweight + qzeros = awq_qzeros + scales = awq_scales if qweight is None or qzeros is None or scales is None: + if weight is None: + raise RuntimeError("AWQ offline weights missing packed tensors and bf16 weight is not present.") return F.linear(x, weight, bias) - # Infer pack_factor from packed shapes to avoid hard-coding 4-bit. - # AWQ: qweight [K, N/pack], scales [K/group, N] - if scales.ndim != 2 or scales.shape[1] <= 0: - raise RuntimeError(f"Invalid AWQ scales shape: {tuple(scales.shape)}") - if qweight.shape[1] <= 0 or int(scales.shape[1]) % int(qweight.shape[1]) != 0: - raise RuntimeError( - f"Invalid AWQ packed shapes: qweight.shape={tuple(qweight.shape)}, " - f"scales.shape={tuple(scales.shape)}" - ) - pack_factor = int(scales.shape[1]) // int(qweight.shape[1]) # vLLM AWQ kernels expect FP16 activations. - x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x - qweight = qweight.to(device=x.device, dtype=torch.int32) - qzeros = qzeros.to(device=x.device, dtype=torch.int32) - scales = scales.to(device=x.device, dtype=torch.float16) + x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) - out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,) + # Use known out_features if provided (avoid per-call inference). + n = int(out_features) if out_features is not None else int(scales.shape[1]) + out_shape = x.shape[:-1] + (n,) reshaped_x = x_in.reshape(-1, x_in.shape[-1]) # Always use awq_gemm to avoid large temporary dequantized weight allocations. - out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor) + # vLLM API: awq_gemm(input, qweight, qzeros, scales, split_k_iters) + split_k_iters = 1 + if reshaped_x.is_contiguous() and qweight.is_contiguous() and qzeros.is_contiguous() and scales.is_contiguous(): + out = torch.ops._C.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) + else: + out = ops.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) if bias is not None: out.add_(bias.to(dtype=out.dtype)) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py index c544166..1425c85 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_marlin_w4a16.py @@ -27,6 +27,7 @@ apply_gptq_marlin_linear, marlin_is_k_full, marlin_make_empty_g_idx, + should_use_atomic_add_reduce, marlin_permute_bias, ) from vllm.scalar_type import scalar_types # type: ignore @@ -34,6 +35,7 @@ apply_gptq_marlin_linear = None # type: ignore marlin_is_k_full = None # type: ignore marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore marlin_permute_bias = None # type: ignore scalar_types = None # type: ignore @@ -44,6 +46,13 @@ def _build_linear_gptq_marlin_w4a16() -> LinearQuantizationStrategy: class LinearGPTQMarlinW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._available: bool = bool(apply_gptq_marlin_linear is not None and scalar_types is not None) + self._empty_cache: dict[int, torch.Tensor] = {} + self._bias_cache: dict[tuple[int, int], torch.Tensor] = {} + self._atomic_add_cache: dict[tuple[int, int, int, int, int], bool] = {} + @property def name(self) -> str: return "linear_gptq_marlin_w4a16" @@ -82,28 +91,28 @@ def dequantize(self, 
quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + g_idx: Optional[torch.Tensor] = None, + g_idx_sort_indices: Optional[torch.Tensor] = None, + workspace: Optional[torch.Tensor] = None, + in_features: int = 0, + out_features: int = 0, + group_size: int = 128, + weight_bits: int = 0, + tp_dim: Optional[int] = None, ) -> torch.Tensor: - _ = quant_kind, weight - if apply_gptq_marlin_linear is None or scalar_types is None: + _ = quant_kind, weight, group_size + if not self._available or workspace is None: raise RuntimeError("gptq_marlin 需要 vLLM (marlin_utils + scalar_types);当前环境不可用。") - qweight = kwargs.get("gptq_marlin_qweight", None) - scales = kwargs.get("gptq_marlin_scales", None) - zp = kwargs.get("gptq_marlin_zp", None) - g_idx = kwargs.get("gptq_marlin_g_idx", None) - g_idx_sort_indices = kwargs.get("gptq_marlin_g_idx_sort_indices", None) - workspace = kwargs.get("gptq_marlin_workspace", None) - in_features = int(kwargs.get("in_features", 0)) - out_features = int(kwargs.get("out_features", 0)) - weight_bits = int(kwargs.get("gptq_weight_bits", 0)) - - if any(t is None for t in (qweight, scales, zp, workspace)) or in_features <= 0 or out_features <= 0: - raise RuntimeError("gptq_marlin: missing prepared marlin tensors (qweight/scales/zp/workspace).") + if in_features <= 0 or out_features <= 0: + raise RuntimeError("gptq_marlin: missing in_features/out_features.") if weight_bits == 4: wtype = scalar_types.uint4b8 @@ -112,45 +121,84 @@ def linear_forward( else: raise RuntimeError(f"gptq_marlin: unsupported weight_bits={weight_bits} (expected 4 or 8)") - # Align with vLLM Marlin: accept bf16/fp16 activations directly. - x_in = x + device = x.device + dev_key = int(device.index) if device.type == "cuda" and device.index is not None else -1 - # g_idx can be empty (desc_act=False). Ensure correct dtype/device. - if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): - g_idx_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + # g_idx can be empty (desc_act=False). Prefer already-correct tensors; avoid per-call to(). + if g_idx is None or g_idx.numel() == 0: + empty = self._empty_cache.get(dev_key) + if empty is None: + empty = marlin_make_empty_g_idx(device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=device, dtype=torch.int32) + self._empty_cache[dev_key] = empty + g_idx_t = empty else: - g_idx_t = g_idx.to(device=x.device, dtype=torch.int32) - if g_idx_sort_indices is None or (isinstance(g_idx_sort_indices, torch.Tensor) and g_idx_sort_indices.numel() == 0): - g_idx_sort_t = marlin_make_empty_g_idx(x.device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=x.device, dtype=torch.int32) + g_idx_t = g_idx + if g_idx_sort_indices is None or g_idx_sort_indices.numel() == 0: + empty = self._empty_cache.get(dev_key) + if empty is None: + empty = marlin_make_empty_g_idx(device) if marlin_make_empty_g_idx is not None else torch.empty((0,), device=device, dtype=torch.int32) + self._empty_cache[dev_key] = empty + g_idx_sort_t = empty else: - g_idx_sort_t = g_idx_sort_indices.to(device=x.device, dtype=torch.int32) + g_idx_sort_t = g_idx_sort_indices # Determine whether K is full (needed by marlin kernel). 
Row-parallel layers set tp_dim=1 in Diffulex. - row_parallel = bool(kwargs.get("tp_dim", None) == 1) + row_parallel = bool(tp_dim == 1) has_g_idx = bool(g_idx_t.numel() > 0) - if marlin_is_k_full is None: - is_k_full = True - else: - is_k_full = marlin_is_k_full(has_g_idx, row_parallel) + is_k_full = True if marlin_is_k_full is None else marlin_is_k_full(has_g_idx, row_parallel) + # Cache permuted bias (Marlin expects permuted bias order). marlin_bias = None if bias is not None: - marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias - - out = apply_gptq_marlin_linear( - input=x_in, - weight=qweight, - weight_scale=scales, - weight_zp=zp, - g_idx=g_idx_t, - g_idx_sort_indices=g_idx_sort_t, - workspace=workspace, - wtype=wtype, - output_size_per_partition=out_features, - input_size_per_partition=in_features, - is_k_full=is_k_full, - bias=marlin_bias, - input_dtype=None, + bkey = (dev_key, int(bias.data_ptr())) + marlin_bias = self._bias_cache.get(bkey) + if marlin_bias is None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + self._bias_cache[bkey] = marlin_bias + + # Flatten like F.linear: [*,K] -> [M,K] + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (int(out_features),) + + # Cache heuristic for atomic-add reduction (depends on M/N/K, device, dtype). + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + dtype_id = 1 if reshaped_x.dtype == torch.bfloat16 else (2 if reshaped_x.dtype == torch.float16 else 0) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + akey = (dev_key, dtype_id, m, n, k) + cached = self._atomic_add_cache.get(akey) + if cached is None: + cached = bool( + should_use_atomic_add_reduce( + m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype + ) + ) + self._atomic_add_cache[akey] = cached + use_atomic_add = cached + + # Directly call the underlying CUDA op to minimize Python glue. 
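+        # The positional arguments follow this vLLM build's gptq_marlin_gemm schema; the trailing
+        # booleans are is_k_full, use_atomic_add, use_fp32_reduce and is_zp_float (labeled inline below).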
+ out = torch.ops._C.gptq_marlin_gemm( + reshaped_x, + None, + qweight, + marlin_bias, + scales, + None, + None, + zp, + g_idx_t, + g_idx_sort_t, + workspace, + wtype.id, + m, + n, + k, + is_k_full, + use_atomic_add, + True, # use_fp32_reduce + False, # is_zp_float ) - return out + return out.reshape(out_shape) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index 8fc67a5..f0a7a98 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -34,6 +34,10 @@ def _build_linear_gptq_w4a16() -> LinearQuantizationStrategy: class LinearGPTQW4A16Strategy(LinearQuantizationStrategy): + def __init__(self) -> None: + super().__init__() + self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_gemm")) + @property def name(self) -> str: return "linear_gptq_w4a16" @@ -77,67 +81,92 @@ def dequantize(self, quantized: torch.Tensor, scale_or_metadata: Any, **kwargs) def linear_forward( self, x: torch.Tensor, - weight: torch.Tensor, + weight: Optional[torch.Tensor], bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + gptq_qweight: Optional[torch.Tensor] = None, + gptq_qzeros: Optional[torch.Tensor] = None, + gptq_scales: Optional[torch.Tensor] = None, + gptq_g_idx: Optional[torch.Tensor] = None, + weight_bits: int = 0, + use_v2_format: bool = False, + out_features: Optional[int] = None, + in_features: Optional[int] = None, + group_size: int = 128, ) -> torch.Tensor: - _ = quant_kind, weight - if ops is None: + _ = quant_kind, weight, in_features, group_size + if not self._ops_available: raise RuntimeError( "vLLM is required for GPTQ W4A16 (missing `vllm._custom_ops`). " "Please install/build vLLM with CUDA ops." ) - - qweight = kwargs.get("gptq_qweight", None) - qzeros = kwargs.get("gptq_qzeros", None) - scales = kwargs.get("gptq_scales", None) - g_idx = kwargs.get("gptq_g_idx", None) + qweight = gptq_qweight + qzeros = gptq_qzeros + scales = gptq_scales + g_idx = gptq_g_idx if qweight is None or qzeros is None or scales is None: + # correctness fallback (should not happen for offline GPTQ weights) + if weight is None: + raise RuntimeError("GPTQ offline weights missing packed tensors and bf16 weight is not present.") return F.linear(x, weight, bias) - use_v2_format = bool(kwargs.get("gptq_use_v2_format", False)) - - # Infer weight_bits from packed shapes to support GPTQ W2/W4/W8. - # qzeros: [K/group, N/pack_factor] and qweight: [K/pack_factor, N] - if qzeros.shape[1] <= 0 or qweight.shape[1] % int(qzeros.shape[1]) != 0: - raise RuntimeError( - f"Invalid GPTQ packed shapes: qweight.shape={tuple(qweight.shape)}, " - f"qzeros.shape={tuple(qzeros.shape)}" - ) - pack_factor = int(qweight.shape[1]) // int(qzeros.shape[1]) - if 32 % pack_factor != 0: - raise RuntimeError( - f"Unsupported GPTQ pack_factor={pack_factor} (requires 32%pack_factor==0). " - f"qweight.shape={tuple(qweight.shape)}, qzeros.shape={tuple(qzeros.shape)}" - ) - weight_bits = 32 // pack_factor - # vLLM GPTQ kernels expect FP16 activations. 
- x_in = x.to(dtype=torch.float16) if x.dtype != torch.float16 else x - qweight = qweight.to(device=x.device, dtype=torch.int32) - qzeros = qzeros.to(device=x.device, dtype=torch.int32) - scales = scales.to(device=x.device, dtype=torch.float16) + x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + + # ---- Fast path ---- + if ( + x_in.dim() == 2 + and x_in.is_contiguous() + and qweight.device == x.device + and qzeros.device == x.device + and scales.device == x.device + and qweight.dtype == torch.int32 + and qzeros.dtype == torch.int32 + and scales.dtype == torch.float16 + and qweight.is_contiguous() + and qzeros.is_contiguous() + and scales.is_contiguous() + and weight_bits > 0 + ): + if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) + else: + # Prefer already-correct dtype/device to avoid per-call copies. + g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) + n = int(out_features) if out_features is not None else int(qweight.shape[-1]) + output = torch.ops._C.gptq_gemm( + x_in, + qweight, + qzeros, + scales, + g_idx_t, + True, + bool(use_v2_format), + int(weight_bits), + ) + if bias is not None: + output.add_(bias.to(dtype=output.dtype)) + # Output is [M,N] + return output.to(dtype=x.dtype) if output.dtype != x.dtype else output + out_shape = x.shape[:-1] + (int(out_features) if out_features is not None else int(qweight.shape[-1]),) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) else: g_idx_t = g_idx.to(device=x.device, dtype=torch.int) - out_shape = x.shape[:-1] + (qweight.shape[-1],) - reshaped_x = x_in.reshape(-1, x_in.shape[-1]) - output = ops.gptq_gemm( reshaped_x, qweight, qzeros, scales, g_idx_t, - True, # use_exllama (vLLM shuffles weights into exllama-friendly layout) - use_v2_format, - weight_bits, + True, # use_exllama + bool(use_v2_format), + int(weight_bits) if weight_bits > 0 else 4, ) if bias is not None: output.add_(bias.to(dtype=output.dtype)) diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py index 52e92ed..ae62b64 100644 --- a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py @@ -22,14 +22,10 @@ from diffulex.utils.quantization.strategy import LinearQuantizationStrategy -def _require_vllm_ops(): - try: - from vllm import _custom_ops as ops # type: ignore - except Exception as e: # pragma: no cover - raise RuntimeError( - "W8A8 需要 vLLM 的 CUDA 自定义算子(vllm._custom_ops)。" - ) from e - return ops +try: + from vllm import _custom_ops as _vllm_ops # type: ignore +except Exception: # pragma: no cover + _vllm_ops = None # type: ignore @register_linear_strategy(weight_dtype="int8", act_dtype="int8") @@ -42,6 +38,12 @@ def __init__(self) -> None: super().__init__() # Cache: id(weight) -> (qweight_int8 [N,K], w_scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} + self._ops_available: bool = bool( + _vllm_ops is not None + and hasattr(torch.ops, "_C") + and hasattr(torch.ops._C, "dynamic_scaled_int8_quant") + and hasattr(torch.ops._C, "cutlass_scaled_mm") + ) @property def name(self) -> str: @@ -109,18 +111,48 @@ def linear_forward( bias: Optional[torch.Tensor], *, quant_kind: 
str, - **kwargs: Any, + quant_scales: Optional[torch.Tensor] = None, + out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind - ops = _require_vllm_ops() + # ---- Fast path (decode hot path) ---- + # Preconditions are strict to minimize Python overhead. + # Expect: + # - qweight: int8 KxN with stride(0)==1 + # - w_scales: float32 [1,N], contiguous + if ( + self._ops_available + and _vllm_ops is not None + and x.dim() == 2 + and x.device.type == "cuda" + and x.dtype in (torch.bfloat16, torch.float16) + and x.is_contiguous() + and weight is not None + and weight.dtype == torch.int8 + and weight.device == x.device + and weight.stride(0) == 1 + and quant_scales is not None + and quant_scales.device == x.device + and quant_scales.dtype == torch.float32 + and quant_scales.dim() == 2 + and quant_scales.is_contiguous() + ): + m, _k = x.shape + # Optionally validate N to catch wrong metadata early. + if out_features is None or int(out_features) == int(quant_scales.shape[1]): + x_q = torch.empty((m, _k), device=x.device, dtype=torch.int8) + x_s = torch.empty((m, 1), device=x.device, dtype=torch.float32) + torch.ops._C.dynamic_scaled_int8_quant(x_q, x, x_s, None) + out = torch.empty((m, int(quant_scales.shape[1])), device=x.device, dtype=x.dtype) + torch.ops._C.cutlass_scaled_mm(out, x_q, weight, x_s, quant_scales, bias) + return out # If weight already quantized by LinearBase.load-time quantization. - quant_scales = kwargs.get("quant_scales", None) if weight is not None and weight.dtype == torch.int8 and quant_scales is not None: - # Expected: qweight is K×N int8, quant_scales is [1,N] fp32 - qweight = weight.to(device=x.device) - w_scales = quant_scales.to(device=x.device, dtype=torch.float32) + # Expected: qweight is K×N int8 (may be non-contiguous), quant_scales is [1,N] fp32 + qweight = weight + w_scales = quant_scales.to(dtype=torch.float32) else: wid = id(weight) cached = self._weight_cache.get(wid) @@ -138,8 +170,8 @@ def linear_forward( if x2.dtype not in (torch.bfloat16, torch.float16): x2 = x2.to(torch.bfloat16) # dynamic per-token int8 quant + fused GEMM_DQ - x_q, x_s, _ = ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) - y = ops.cutlass_scaled_mm( + x_q, x_s, _ = _vllm_ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) + y = _vllm_ops.cutlass_scaled_mm( x_q, qweight, scale_a=x_s, diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index 1cd8eb1..fe99904 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -39,7 +39,9 @@ def _allspark_is_available() -> bool: def _allspark_w8a16_gemm(*args, **kwargs): if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_w8a16_gemm"): raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_w8a16_gemm`.") - return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) + # Narrow profiler range to isolate Python wrapper overhead vs kernel time. + with torch.profiler.record_function("w8a16/allspark_w8a16_gemm(pybind)"): + return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) def _allspark_repack_weight(b_qweight_kn: torch.Tensor, scales_1xn: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: @@ -67,6 +69,11 @@ def __init__(self) -> None: super().__init__() # Cache for bf16 Parameters only (load-time quantized path bypasses this). 
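+        # Keyed by id(weight); each entry holds (qweight, scales) staged on the activation device.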
self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} + # Cache device info and thresholds to reduce per-call CPU overhead. + self._sm_info_cache: dict[int, tuple[int, int]] = {} + self._cublas_m_thr: int = self._cublas_m_threshold() + # One-time availability check (avoid calling `_allspark_is_available()` on every linear). + self._allspark_available: bool = _allspark_is_available() @property def name(self) -> str: @@ -166,8 +173,7 @@ def quantize_weight_for_kernel( block_n = 256 block_n = max(1, block_n) - use_allspark = _allspark_is_available() - if use_allspark: + if self._allspark_available: # AllSpark repack expects B in (K,N) contiguous layout. b_kn = torch.empty((k, n), device=weight.device, dtype=torch.uint8) # [K,N] for i in range(0, n, block_n): @@ -219,12 +225,22 @@ def quantize_act_for_kernel( return x, None def _get_sm_info(self, device: torch.device) -> tuple[int, int]: + # get_device_properties is relatively expensive on hot paths; cache per device index. + try: + idx = int(device.index) if device.index is not None else int(torch.cuda.current_device()) + except Exception: + idx = -1 + cached = self._sm_info_cache.get(idx) + if cached is not None: + return cached try: props = torch.cuda.get_device_properties(device) sm_count = int(getattr(props, "multi_processor_count", 0)) sm_version = int(props.major) * 10 + int(props.minor) + self._sm_info_cache[idx] = (sm_count, sm_version) return sm_count, sm_version except Exception: + self._sm_info_cache[idx] = (0, 0) return 0, 0 def _cublas_m_threshold(self) -> int: @@ -242,39 +258,85 @@ def linear_forward( bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + quant_scales: Optional[torch.Tensor] = None, + out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind + # ---- Fast path (decode hot path) ---- + # Goal: make Python-side overhead close to a single custom-op call (+ optional bias add). + # Preconditions are intentionally strict; otherwise we fall back to the fully-checked path. + # + # Notes: + # - We call `_vllm_ops.allspark_w8a16_gemm` directly to avoid extra Python wrapper overhead. + # - We require `quant_scales` already in 1xN contiguous layout (LinearBase provides this). + if ( + self._allspark_available + and _vllm_ops is not None + and x.dim() == 2 + and x.device.type == "cuda" + and x.dtype == torch.bfloat16 + and x.is_contiguous() + and weight is not None + and weight.dtype in (torch.uint8, torch.int8) + and weight.is_contiguous() + and quant_scales is not None + and quant_scales.dim() == 2 + and quant_scales.is_contiguous() + and out_features is not None + ): + # Minimal shape checks (avoid slow/branchy fallback). + m, k = x.shape + n_32, k_w = weight.shape + if k_w == k and (k & 15) == 0 and 0 < int(out_features) <= int(n_32): + sm_count, sm_version = self._get_sm_info(x.device) + y = _vllm_ops.allspark_w8a16_gemm( + x, + weight, + quant_scales, + None, # b_qzeros + int(out_features), + -1, # group_size (only supports -1) + sm_count, + sm_version, + self._cublas_m_thr, + False, # has_zp + True, # n32k16_reorder + ) + if bias is not None: + y = y + bias + return y + # Handle >2D like torch.nn.functional.linear: flatten then reshape back. 
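+        # e.g. x of shape [B, S, K] becomes x2 of shape [B*S, K] for the GEMM, and the
+        # output [B*S, N] is reshaped back to [B, S, N] at the end of this function.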
- orig_shape = x.shape - if x.dim() == 1: - x2 = x.unsqueeze(0) - elif x.dim() == 2: - x2 = x - else: - x2 = x.reshape(-1, x.shape[-1]) + with torch.profiler.record_function("w8a16/reshape_input"): + orig_shape = x.shape + if x.dim() == 1: + x2 = x.unsqueeze(0) + elif x.dim() == 2: + x2 = x + else: + x2 = x.reshape(-1, x.shape[-1]) # Load-time quantized module path: weight is uint8/int8 buffer and scales provided. - quant_scales = kwargs.pop("quant_scales", None) - if weight is not None and weight.dtype in (torch.uint8, torch.int8): - if quant_scales is None: - raise ValueError("quant_scales is required when weight is quantized") - qweight = weight - scales = quant_scales - else: - # Lazy cache for bf16 weights (not expected in steady-state, but keep for safety). - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None or cached[0].device != x2.device: - qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) - self._weight_cache[weight_id] = (qweight, scales) + with torch.profiler.record_function("w8a16/select_qweight_scales"): + if weight is not None and weight.dtype in (torch.uint8, torch.int8): + if quant_scales is None: + raise ValueError("quant_scales is required when weight is quantized") + qweight = weight + scales = quant_scales else: - qweight, scales = cached + # Lazy cache for bf16 weights (not expected in steady-state, but keep for safety). + weight_id = id(weight) + cached = self._weight_cache.get(weight_id) + if cached is None or cached[0].device != x2.device: + qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) + self._weight_cache[weight_id] = (qweight, scales) + else: + qweight, scales = cached # If fused kernel isn't available, fall back to BF16 only if original weight exists; # otherwise fail fast (do NOT dequantize a full matrix, which is memory-prohibitive). - if not _allspark_is_available(): + if not self._allspark_available: if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): return F.linear(x, weight, bias) raise RuntimeError( @@ -283,56 +345,75 @@ def linear_forward( ) # AllSpark kernel requires CUDA and contiguous inputs. - if x2.device.type != "cuda": - return self._fallback(x, weight, qweight, scales, bias) + with torch.profiler.record_function("w8a16/device_dtype_checks"): + if x2.device.type != "cuda": + return self._fallback(x, weight, qweight, scales, bias) - if x2.dtype != torch.bfloat16: - x2 = x2.to(dtype=torch.bfloat16) + if x2.dtype != torch.bfloat16: + x2 = x2.to(dtype=torch.bfloat16) # Shape checks: x2 [M,K], qweight [N_32align,K] - m, k = x2.shape - n_32, k_w = qweight.shape - if k_w != k: - return self._fallback(x, weight, qweight, scales, bias) - if k % 16 != 0: - return self._fallback(x, weight, qweight, scales, bias) + with torch.profiler.record_function("w8a16/shape_checks"): + m, k = x2.shape + n_32, k_w = qweight.shape + if k_w != k: + return self._fallback(x, weight, qweight, scales, bias) + if k % 16 != 0: + return self._fallback(x, weight, qweight, scales, bias) # Recover real N from module bias/metadata if available; default to n_32. # In Diffulex, LinearBase stores output_size; but strategy doesn't receive module. # So we infer N from bias if present else from scales length (can be N_32align). 
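+        # Prefer out_features when provided: bias/scales may reflect the 32-aligned (padded)
+        # N_32align rather than the real N, so inferring N from them is only a fallback.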
- n = int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32)) - if n <= 0 or n > n_32: - n = n_32 + with torch.profiler.record_function("w8a16/infer_n_and_sm"): + if out_features is not None: + n = int(out_features) + else: + # Backward compatible fallback. + n = int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32)) + if n <= 0 or n > n_32: + n = n_32 - sm_count, sm_version = self._get_sm_info(x2.device) - cublas_thr = self._cublas_m_threshold() + sm_count, sm_version = self._get_sm_info(x2.device) + cublas_thr = self._cublas_m_thr # vLLM allspark expects scales as 1xN (or equivalent contiguous view). - scales_1xn = scales.reshape(1, -1).contiguous() - y2 = _allspark_w8a16_gemm( - x2.contiguous(), - qweight.contiguous(), - scales_1xn, - None, # b_qzeros - n, - -1, # group_size (only supports -1) - sm_count, - sm_version, - cublas_thr, - False, # has_zp - True, # n32k16_reorder - ) - if bias is not None: - y2 = y2 + bias + # NOTE: reshape/view doesn't allocate; only materialize contiguous copies when needed. + with torch.profiler.record_function("w8a16/prepare_contiguous_and_scales"): + if not x2.is_contiguous(): + x2 = x2.contiguous() + # qweight/scales are made contiguous at load-time (`LinearBase.set_quantized_weight`) + # and by `quantize_weight_for_kernel` return values. + if scales.dim() == 2: + scales_1xn = scales + else: + scales_1xn = scales.view(1, -1) + + with torch.profiler.record_function("w8a16/call_fused_gemm"): + y2 = _allspark_w8a16_gemm( + x2, + qweight, + scales_1xn, + None, # b_qzeros + n, + -1, # group_size (only supports -1) + sm_count, + sm_version, + cublas_thr, + False, # has_zp + True, # n32k16_reorder + ) + if bias is not None: + y2 = y2 + bias # Reshape back - if x.dim() == 1: - y = y2.squeeze(0) - elif x.dim() == 2: - y = y2 - else: - y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) - return y + with torch.profiler.record_function("w8a16/reshape_output"): + if x.dim() == 1: + y = y2.squeeze(0) + elif x.dim() == 2: + y = y2 + else: + y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) + return y # NOTE: We intentionally do not provide a generic dequantize+F.linear fallback for reordered weights. # It materializes a full bf16 matrix and is prone to OOM on large models. 
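# --- Editorial sketch (not part of the diff) ---------------------------------
# A minimal, hypothetical pre-flight check mirroring the W8A16 fast-path
# preconditions documented above; the function name and the `n_pad` naming are
# illustrative only, and the real checks live inline in `linear_forward`.
import torch

def w8a16_fast_path_ok(x: torch.Tensor, qweight: torch.Tensor,
                       scales: torch.Tensor, out_features: int) -> bool:
    # x:       [M, K] bf16, contiguous, CUDA, with K a multiple of 16
    # qweight: [N_pad, K] uint8/int8 (n32k16-reordered), contiguous
    # scales:  [1, N_pad] contiguous; out_features is the real (unpadded) N
    if not (x.dim() == 2 and x.is_cuda and x.dtype == torch.bfloat16 and x.is_contiguous()):
        return False
    if qweight.dtype not in (torch.uint8, torch.int8) or not qweight.is_contiguous():
        return False
    if not (scales.dim() == 2 and scales.is_contiguous()):
        return False
    k = x.shape[1]
    n_pad, k_w = qweight.shape
    return k_w == k and k % 16 == 0 and 0 < out_features <= n_pad
# ------------------------------------------------------------------------------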
diff --git a/diffulex_kernel/python/dllm_flash_attn_kernels.py b/diffulex_kernel/python/dllm_flash_attn_kernels.py index 8877c49..1535ab0 100644 --- a/diffulex_kernel/python/dllm_flash_attn_kernels.py +++ b/diffulex_kernel/python/dllm_flash_attn_kernels.py @@ -1,887 +1,152 @@ -import os -import torch -import tilelang -import tilelang.language as T - -from flash_attn import flash_attn_varlen_func -from tilelang.autotuner import set_autotune_inputs - -from diffulex_kernel.python.auto_tuner import build_configs -from diffulex_kernel.python.kv_cache_kernels import load_kvcache -from diffulex.attention.metadata import AttnMetaDataBase, is_warming_up -from test.python.utils.checker import CHECK_FLASH_ATTN_PREFILL, CHECK_FLASH_ATTN_DECODE - - -# from tilelang.engine.callback import register_cuda_postproc_callback -# @register_cuda_postproc_callback -# def tilelang_callback_cuda_postproc(code, _): -# code = "// tilelang_callback_cuda_postproc: generated CUDA code by TileLang\n" + code -# print(code) -# return code - - -kernel_config = None -kernel_config_bf16_q_fp8_kv_decode = None - - -@tilelang.autotune(configs=build_configs()) -@tilelang.jit( - # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - } -) -def dllm_flash_attn_prefill_kernel( - NUM_SEQS: int, - NUM_GROUPS: int, - Q_LEN: int, - KV_LEN: int, - NUM_HEADS: int, - HEAD_DIM: int, - IS_BLOCK_ATTN: bool, - DIFFUSION_BLOCK_SIZE: int, - BLOCK_M: int = 64, - BLOCK_N: int = 64, - NUM_STAGES: int = 1, - NUM_THREADS: int = 128, -): - SCALE = (1.0 / HEAD_DIM)**0.5 * 1.44269504 # log2(e) - NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS - Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - DTYPE = "bfloat16" - ACCUM_DTYPE = "float" - - @T.prim_func - def kernel( - Q: T.Tensor(Q_SHAPE, DTYPE), - K: T.Tensor(KV_SHAPE, DTYPE), - V: T.Tensor(KV_SHAPE, DTYPE), - cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), - cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), - max_seqlen_q: T.int32, - O: T.Tensor(O_SHAPE, DTYPE), - ): - with T.Kernel(T.ceildiv(max_seqlen_q, BLOCK_M), NUM_HEADS, NUM_SEQS, threads=NUM_THREADS) as (bx, by, bz): - Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - - acc_score = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) - acc_score_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) - acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) - scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - q_block_idx = bx - seq_idx = bz - head_idx = by - kv_head_idx = head_idx // NUM_GROUPS - - q_start_idx = cu_seqlens_q[seq_idx] - kv_start_idx = cu_seqlens_k[seq_idx] - q_end_idx = cu_seqlens_q[seq_idx + 1] - kv_end_idx = cu_seqlens_k[seq_idx + 1] - - cur_q_seqlen = q_end_idx - q_start_idx - cur_kv_seqlen 
= kv_end_idx - kv_start_idx - - T.copy(Q[q_start_idx + q_block_idx * BLOCK_M : q_start_idx + (q_block_idx + 1) * BLOCK_M, head_idx, :], Q_shared) - - T.fill(acc_output, 0) - T.fill(acc_score, 0) - T.fill(log_sum, 0) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - - # The same boundary condition as naive causal mask - loop_range = ( - T.min(T.ceildiv(cur_q_seqlen + (q_block_idx + 1) * BLOCK_M, BLOCK_N), T.ceildiv(cur_kv_seqlen, BLOCK_N)) - if IS_BLOCK_ATTN else T.ceildiv(cur_kv_seqlen, BLOCK_N) - ) - for kv_block_idx in T.Pipelined(loop_range, num_stages=NUM_STAGES): - T.copy(K[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], K_shared) - - # Initialize acc_score with mask - if IS_BLOCK_ATTN: - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - num_diffusion_blocks = (q_block_idx * BLOCK_M + i) // DIFFUSION_BLOCK_SIZE + 1 - acc_score[i, j] = T.if_then_else( - (num_diffusion_blocks * DIFFUSION_BLOCK_SIZE <= kv_block_idx * BLOCK_N + j) or - (q_block_idx * BLOCK_M + i >= cur_q_seqlen or - kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), -1e9, 0 - ) - else: - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score[i, j] = T.if_then_else( - (q_block_idx * BLOCK_M + i >= cur_q_seqlen or - kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), -1e9, 0 - ) - - # Compute attention scores - T.gemm(Q_shared, K_shared, acc_score, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - # Compute online softmax - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score, scores_max, dim=1, clear=False) # T.reduce_max(acc_score, scores_max, dim=1, clear=True) # TODO: check if this is correct - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score[i, j] = T.exp2(acc_score[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score, acc_score_cast) - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - # Compute attention output - T.copy(V[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], V_shared) - T.gemm(acc_score_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] /= log_sum[i] - - T.copy(acc_output, O_shared) - for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): - if i + q_block_idx * BLOCK_M < cur_q_seqlen: - O[i + q_start_idx + q_block_idx * BLOCK_M, head_idx, d_idx] = O_shared[i, d_idx] - - return kernel - - -@tilelang.jit( - # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - # tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_ENABLE: True, - # tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_FORMATS: "txt,pdf" - } -) -def dllm_flash_attn_decode_kernel( - NUM_SEQS: int, - NUM_GROUPS: int, - NUM_PAGE_BLOCKS: int, - Q_LEN: int, - KV_LEN: int, - NUM_HEADS: int, - HEAD_DIM: int, - IS_BLOCK_ATTN: bool, - DIFFUSION_BLOCK_SIZE: int, - MAX_SEQ_NUM_BLOCKS: int, - PAGE_BLOCK_SIZE: int = 32, - BLOCK_M: int = 64, - 
BLOCK_N: int = 64, - NUM_STAGES: int = 1, - NUM_THREADS: int = 128, -): - SCALE = (1.0 / HEAD_DIM)**0.5 * 1.44269504 # log2(e) - NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS - Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - K_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - V_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - MAX_SEQ_NUM_BLOCKS = T.dynamic("MAX_SEQ_NUM_BLOCKS", 'int32') - BLOCK_TABLES_SHAPE = [NUM_SEQS, MAX_SEQ_NUM_BLOCKS] - DTYPE = "bfloat16" - ACCUM_DTYPE = "float32" - - @T.prim_func - def kernel( - Q: T.Tensor(Q_SHAPE, DTYPE), - K: T.Tensor(KV_SHAPE, DTYPE), - V: T.Tensor(KV_SHAPE, DTYPE), - K_Cache: T.Tensor(K_CACHE_SHAPE, DTYPE), - V_Cache: T.Tensor(V_CACHE_SHAPE, DTYPE), - block_tables: T.Tensor(BLOCK_TABLES_SHAPE, "int32"), - context_lens: T.Tensor(NUM_SEQS, "int32"), - cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), - cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), - max_seqlen_q: T.int32, - O: T.Tensor(O_SHAPE, DTYPE), - ): - with T.Kernel(NUM_SEQS, NUM_HEADS, threads=NUM_THREADS) as (bx, by): - Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_Cache_shared = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - V_Cache_shared = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - - acc_score_kv = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) - acc_score_kv_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) - acc_score_kvcache = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], ACCUM_DTYPE) - acc_score_kvcache_cast = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], DTYPE) - - acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) - scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - seq_idx = bx - head_idx = by - kv_head_idx = head_idx // NUM_GROUPS - - q_start_idx = cu_seqlens_q[seq_idx] - kv_start_idx = cu_seqlens_k[seq_idx] - q_end_idx = cu_seqlens_q[seq_idx + 1] - kv_end_idx = cu_seqlens_k[seq_idx + 1] - - cur_q_seqlen = q_end_idx - q_start_idx - cur_kv_seqlen = kv_end_idx - kv_start_idx - - cur_context_len = context_lens[seq_idx] - - T.copy(Q[q_start_idx : q_start_idx + BLOCK_M, head_idx, :], Q_shared) - - T.fill(acc_output, 0) - T.fill(acc_score_kv, 0) - T.fill(acc_score_kvcache, 0) - T.fill(log_sum, 0) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - - # ========================== - # Stage 1: KV Cache Attention (Context) - # ========================== - for page_block_idx_local in T.Pipelined(MAX_SEQ_NUM_BLOCKS, num_stages=NUM_STAGES): - page_block_idx_global = block_tables[seq_idx, page_block_idx_local] - - if page_block_idx_global >= 0: - T.copy(K_Cache[page_block_idx_global, :, kv_head_idx, :], K_Cache_shared) - - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.if_then_else( - (i >= cur_q_seqlen or - page_block_idx_local * PAGE_BLOCK_SIZE + j >= cur_context_len), -1e9, 0 - ) - - # Compute attention scores - T.gemm(Q_shared, K_Cache_shared, 
acc_score_kvcache, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - # Compute online softmax - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kvcache, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.exp2(acc_score_kvcache[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kvcache, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score_kvcache, acc_score_kvcache_cast) - - # Scale previous output accumulator - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - # Accumulate current V_cache contribution - T.copy(V_Cache[page_block_idx_global, :, kv_head_idx, :], V_Cache_shared) - T.gemm(acc_score_kvcache_cast, V_Cache_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - - # ========================== - # Stage 2: Fresh KV Attention (Self-Attn) - # ========================== - for idx in T.Pipelined(T.ceildiv(DIFFUSION_BLOCK_SIZE, BLOCK_N), num_stages=NUM_STAGES): - T.copy(K[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], K_shared) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.if_then_else(i >= cur_q_seqlen or j >= cur_kv_seqlen, -1e9, 0) - - T.gemm(Q_shared, K_shared, acc_score_kv, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kv, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.exp2(acc_score_kv[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kv, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score_kv, acc_score_kv_cast) - - # Scale previous output - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - T.copy(V[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], V_shared) - - # Accumulate current V contribution - T.gemm(acc_score_kv_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - - # ========================== - # Stage 3: Finalize - # ========================== - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] /= log_sum[i] +""" +Diffulex Flash-Attn kernel wrappers. - T.copy(acc_output, O_shared) - for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): - if i < cur_q_seqlen: - O[i + q_start_idx, head_idx, d_idx] = O_shared[i, d_idx] - - return kernel +Goals: +- Decode path should NOT require TileLang at import time. +- Prefill behavior remains unchanged (TileLang for block attention / flash-attn varlen otherwise), + but TileLang is imported lazily only when prefill is called. 
+""" +from __future__ import annotations -@tilelang.autotune(configs=build_configs()) -@tilelang.jit( - # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - } -) -def dllm_flash_attn_decode_kernel_bf16_q_fp8_kv( - NUM_SEQS: int, - NUM_GROUPS: int, - NUM_PAGE_BLOCKS: int, - Q_LEN: int, - KV_LEN: int, - NUM_HEADS: int, - HEAD_DIM: int, - IS_BLOCK_ATTN: bool, - DIFFUSION_BLOCK_SIZE: int, - MAX_SEQ_NUM_BLOCKS: int, - PAGE_BLOCK_SIZE: int = 32, - BLOCK_M: int = 64, - BLOCK_N: int = 64, - NUM_STAGES: int = 1, - NUM_THREADS: int = 128, -): - SCALE = (1.0 / HEAD_DIM)**0.5 * 1.44269504 # log2(e) - NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS - Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] - K_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - V_CACHE_SHAPE = [NUM_PAGE_BLOCKS, PAGE_BLOCK_SIZE, NUM_KV_HEADS, HEAD_DIM] - MAX_SEQ_NUM_BLOCKS = T.dynamic("MAX_SEQ_NUM_BLOCKS", 'int32') - BLOCK_TABLES_SHAPE = [NUM_SEQS, MAX_SEQ_NUM_BLOCKS] - DTYPE = "bfloat16" - ACCUM_DTYPE = "float32" - FP8_DTYPE = "float8_e4m3fn" - - @T.prim_func - def kernel( - Q: T.Tensor(Q_SHAPE, DTYPE), - K: T.Tensor(KV_SHAPE, DTYPE), - V: T.Tensor(KV_SHAPE, DTYPE), - K_Cache: T.Tensor(K_CACHE_SHAPE, FP8_DTYPE), - V_Cache: T.Tensor(V_CACHE_SHAPE, FP8_DTYPE), - K_Scale: T.Tensor([NUM_KV_HEADS], "float32"), - V_Scale: T.Tensor([NUM_KV_HEADS], "float32"), - block_tables: T.Tensor(BLOCK_TABLES_SHAPE, "int32"), - context_lens: T.Tensor(NUM_SEQS, "int32"), - cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), - cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), - max_seqlen_q: T.int32, - O: T.Tensor(O_SHAPE, DTYPE), - ): - with T.Kernel(NUM_SEQS, NUM_HEADS, threads=NUM_THREADS) as (bx, by): - Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) - O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) - - # KV cache shared staging buffers (BF16): - # HBM(FP8) -> T.copy (implicit cast) -> shared(BF16) -> GEMM - K_Cache_shared_bf16 = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - V_Cache_shared_bf16 = T.alloc_shared([PAGE_BLOCK_SIZE, HEAD_DIM], DTYPE) - - acc_score_kv = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) - acc_score_kv_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) - acc_score_kvcache = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], ACCUM_DTYPE) - acc_score_kvcache_cast = T.alloc_fragment([BLOCK_M, PAGE_BLOCK_SIZE], DTYPE) - - acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) - scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) - - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - seq_idx = bx - head_idx = by - kv_head_idx = head_idx // NUM_GROUPS - - q_start_idx = cu_seqlens_q[seq_idx] - kv_start_idx = cu_seqlens_k[seq_idx] - q_end_idx = cu_seqlens_q[seq_idx + 1] - kv_end_idx = cu_seqlens_k[seq_idx + 1] - - cur_q_seqlen = q_end_idx - q_start_idx - 
cur_kv_seqlen = kv_end_idx - kv_start_idx - - cur_context_len = context_lens[seq_idx] - - T.copy(Q[q_start_idx : q_start_idx + BLOCK_M, head_idx, :], Q_shared) - - T.fill(acc_output, 0) - T.fill(acc_score_kv, 0) - T.fill(acc_score_kvcache, 0) - T.fill(log_sum, 0) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - - # ========================== - # Stage 1: KV Cache Attention (Context) - # ========================== - for page_block_idx_local in T.Pipelined(MAX_SEQ_NUM_BLOCKS, num_stages=NUM_STAGES): - page_block_idx_global = block_tables[seq_idx, page_block_idx_local] - if page_block_idx_global >= 0: - # Step 1: Load FP8 K_Cache, implicit cast to BF16 (vectorized path). - # K_Scale will be applied on scores (much cheaper than scaling K elementwise). - T.copy(K_Cache[page_block_idx_global, :, kv_head_idx, :], K_Cache_shared_bf16) - - # Initialize scores with mask, then GEMM accumulates into it (masked entries remain ~-1e9). - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.if_then_else( - (i >= cur_q_seqlen or page_block_idx_local * PAGE_BLOCK_SIZE + j >= cur_context_len), - -1e9, - 0, - ) - - # Compute attention scores - T.gemm(Q_shared, K_Cache_shared_bf16, acc_score_kvcache, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - # Apply per-head K scale on scores: (Q·(K*ks)) == (Q·K) * ks - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] *= K_Scale[kv_head_idx] - - # Compute online softmax - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kvcache, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache[i, j] = T.exp2(acc_score_kvcache[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kvcache, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - # Cast weights to BF16 for V GEMM, fuse per-head V scale here: - # (softmax * (V*vs)) == ((softmax*vs) · V) - # Use separate loop to avoid layout infer conflict - for i, j in T.Parallel(BLOCK_M, PAGE_BLOCK_SIZE): - acc_score_kvcache_cast[i, j] = (acc_score_kvcache[i, j] * V_Scale[kv_head_idx]).astype(T.bfloat16) - - # Scale previous output accumulator - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - # Step 2: Load FP8 V_Cache, implicit cast to BF16 (vectorized path). 
- T.copy(V_Cache[page_block_idx_global, :, kv_head_idx, :], V_Cache_shared_bf16) - - # Accumulate current V_cache contribution using BF16 V_Cache shared buffer - T.gemm(acc_score_kvcache_cast, V_Cache_shared_bf16, acc_output, policy=T.GemmWarpPolicy.FullRow) - - if page_block_idx_local == MAX_SEQ_NUM_BLOCKS - 1: - # ========================== - # Stage 2: Fresh KV Attention (Self-Attn) - # ========================== - for idx in T.Pipelined(T.ceildiv(DIFFUSION_BLOCK_SIZE, BLOCK_N), num_stages=NUM_STAGES): - T.copy(K[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], K_shared) +import os - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.if_then_else(i >= cur_q_seqlen or j >= cur_kv_seqlen, -1e9, 0) - - T.gemm(Q_shared, K_shared, acc_score_kv, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) - T.reduce_max(acc_score_kv, scores_max, dim=1, clear=False) - for i in T.Parallel(BLOCK_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - - for i in T.Parallel(BLOCK_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) - - for i, j in T.Parallel(BLOCK_M, BLOCK_N): - acc_score_kv[i, j] = T.exp2(acc_score_kv[i, j] * SCALE - scores_max[i] * SCALE) - - T.reduce_sum(acc_score_kv, scores_sum, dim=1) - for i in T.Parallel(BLOCK_M): - log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] - - T.copy(acc_score_kv, acc_score_kv_cast) - - # Scale previous output - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] *= scores_scale[i] - - T.copy(V[kv_start_idx : kv_start_idx + BLOCK_N, kv_head_idx, :], V_shared) - - # Accumulate current V contribution - T.gemm(acc_score_kv_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) - - # ========================== - # Stage 3: Finalize - # ========================== - for i, j in T.Parallel(BLOCK_M, HEAD_DIM): - acc_output[i, j] /= log_sum[i] +import torch +from flash_attn import flash_attn_varlen_func - T.copy(acc_output, O_shared) - for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): - if i < cur_q_seqlen: - O[i + q_start_idx, head_idx, d_idx] = O_shared[i, d_idx] - - return kernel +from diffulex.attention.metadata import AttnMetaDataBase +from diffulex_kernel.python.kv_cache_kernels import load_kvcache +from diffulex_kernel.python.paged_attn_decode_triton import paged_attn_decode_unified_triton -def _dllm_flash_attn_prefill_bf16( +def dllm_flash_attn_prefill( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - if attn_metadata.attn_type == "full_attention": - return flash_attn_varlen_func( - q, k, v, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - elif attn_metadata.attn_type == "block_attention": - if is_warming_up(): - global kernel_config - with set_autotune_inputs([ - q, k, v, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ]): - prefill_kernel = dllm_flash_attn_prefill_kernel( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size - ) - kernel_config = prefill_kernel.config - # CHECK_FLASH_ATTN_PREFILL( - # q, k, v, - # attn_metadata.cu_seqlens_q, - # attn_metadata.cu_seqlens_k, - # 
attn_metadata.max_seqlen_q, - # prefill_kernel, - # diffusion_block_size=attn_metadata.diffusion_block_size, - # is_block_attn=(attn_metadata.attn_type == "block_attention"), - # ) - return prefill_kernel( - q, k, v, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ) - else: - prefill_kernel = dllm_flash_attn_prefill_kernel( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size, - **kernel_config - ) - return prefill_kernel( - q, k, v, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ) - + """ + Prefill attention wrapper. -def _dllm_flash_attn_decode_bf16( + TileLang is imported lazily so decode-only usage does not depend on TileLang. + """ + from diffulex_kernel.python.dllm_flash_attn_prefill_tilelang import ( + dllm_flash_attn_prefill_tilelang, + ) + + return dllm_flash_attn_prefill_tilelang(q, k, v, scale, attn_metadata) + + +def _decode_varlen( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - if attn_metadata.decode_mode == "static": - # Use kernel_config from prefill if available, otherwise use empty dict - config_kwargs = kernel_config if kernel_config is not None else {} - decode_kernel = dllm_flash_attn_decode_kernel( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - k_cache.shape[0], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size, - attn_metadata.block_tables.shape[1], - attn_metadata.page_block_size, - **config_kwargs + """ + Varlen decode path: + - gather/dequant KV cache with Triton `load_kvcache` + - run `flash_attn_varlen_func` + """ + do_profile = os.getenv("DIFFULEX_PROFILE_KVCACHE", "0") == "1" + if do_profile and q.is_cuda: + e0, e1, e2 = ( + torch.cuda.Event(enable_timing=True), + torch.cuda.Event(enable_timing=True), + torch.cuda.Event(enable_timing=True), ) - if not is_warming_up(): - CHECK_FLASH_ATTN_DECODE( - q, k, v, - k_cache, v_cache, - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - decode_kernel, - scale=scale, - num_groups=q.shape[1] // k.shape[1], - page_block_size=attn_metadata.page_block_size, - diffusion_block_size=attn_metadata.diffusion_block_size, - is_block_attn=(attn_metadata.attn_type == "block_attention"), - ) - - return decode_kernel( - q, k, v, k_cache, v_cache, - attn_metadata.block_tables, - attn_metadata.context_lens, + e0.record() + k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) + e1.record() + out = flash_attn_varlen_func( + q, + k_comb, + v_comb, attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, attn_metadata.max_seqlen_q, + attn_metadata.max_seqlen_k, + softmax_scale=scale, + block_table=None, ) - elif attn_metadata.decode_mode == "varlen": - do_profile = os.getenv("DIFFULEX_PROFILE_KVCACHE", "0") == "1" - if do_profile and q.is_cuda: - e0, e1, e2 = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) - e0.record() - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - e1.record() - out = flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, 
attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - e2.record() - e2.synchronize() - print( - f"[DIFFULEX_PROFILE_KVCACHE] decode(varlen,bf16kv) " - f"load_kvcache={e0.elapsed_time(e1):.3f}ms flash_attn={e1.elapsed_time(e2):.3f}ms" - ) - return out - else: - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - return flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) + e2.record() + e2.synchronize() + print( + f"[DIFFULEX_PROFILE_KVCACHE] decode(varlen) " + f"load_kvcache={e0.elapsed_time(e1):.3f}ms flash_attn={e1.elapsed_time(e2):.3f}ms" + ) + return out + + k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) + return flash_attn_varlen_func( + q, + k_comb, + v_comb, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + attn_metadata.max_seqlen_k, + softmax_scale=scale, + block_table=None, + ) -def _dllm_flash_attn_decode_bf16_q_fp8_kv( +def _decode_static_unified_triton_bf16( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - """BF16 Q + FP8 KV decode helper function that uses BF16-Q/FP8-KV kernel with internal dequantization.""" - if attn_metadata.k_scale is None or attn_metadata.v_scale is None: - raise ValueError("FP8 KV decode requires k_scale and v_scale in metadata") - - # KV cache is stored as uint8 for FP8, but TileLang expects float8 view dtype. - from diffulex.utils.quantization.context import get_kv_cache_strategy - strategy = get_kv_cache_strategy() - if strategy is None or getattr(strategy, "kv_cache_format", "bf16") != "fp8": - raise ValueError(f"Expected kv_cache_format='fp8', got strategy={type(strategy)}") - k_cache = strategy.view_kv_cache_for_kernels(k_cache) - v_cache = strategy.view_kv_cache_for_kernels(v_cache) - - if attn_metadata.decode_mode == "static": - global kernel_config_bf16_q_fp8_kv_decode - common_args = ( - attn_metadata.num_seqs, - q.shape[1] // k.shape[1], - k_cache.shape[0], - q.shape[0], - k.shape[0], - q.shape[1], - q.shape[2], - attn_metadata.attn_type == "block_attention", - attn_metadata.diffusion_block_size, - attn_metadata.block_tables.shape[1], - attn_metadata.page_block_size, - ) - - # BF16-Q/FP8-KV decode needs its own autotuned config; do not reuse prefill/BF16 config. - # In some environments, TileLang autotuning may fail (e.g. no valid configs compile/validate). - # In that case, fall back to the varlen path (Python dequant + flash-attn varlen) for correctness. 
- try: - if is_warming_up() or kernel_config_bf16_q_fp8_kv_decode is None: - with set_autotune_inputs([ - q, k, v, - k_cache, v_cache, - attn_metadata.k_scale, - attn_metadata.v_scale, - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ]): - decode_kernel = dllm_flash_attn_decode_kernel_bf16_q_fp8_kv(*common_args) - kernel_config_bf16_q_fp8_kv_decode = decode_kernel.config - else: - decode_kernel = dllm_flash_attn_decode_kernel_bf16_q_fp8_kv( - *common_args, - **kernel_config_bf16_q_fp8_kv_decode, - ) - - return decode_kernel( - q, k, v, k_cache, v_cache, - attn_metadata.k_scale, # Pass K scale - attn_metadata.v_scale, # Pass V scale - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.cu_seqlens_q, - attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, - ) - except RuntimeError as e: - # Fall back if autotuning or runtime validation fails. - if "Auto-tuning failed" in str(e) or "No configuration" in str(e): - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - return flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - raise - elif attn_metadata.decode_mode == "varlen": - # varlen模式使用load_kvcache:FP8 反量化/scale 融合应在 load_kvcache 内完成(Triton fused kernel) - do_profile = os.getenv("DIFFULEX_PROFILE_KVCACHE", "0") == "1" - if do_profile and q.is_cuda: - e0, e1, e2 = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) - e0.record() - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - e1.record() - out = flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - e2.record() - e2.synchronize() - print( - f"[DIFFULEX_PROFILE_KVCACHE] decode(varlen,fp8kv) " - f"load_kvcache={e0.elapsed_time(e1):.3f}ms flash_attn={e1.elapsed_time(e2):.3f}ms" - ) - return out - else: - k_comb, v_comb = load_kvcache(k_cache, v_cache, attn_metadata, k, v) - return flash_attn_varlen_func( - q, k_comb, v_comb, - attn_metadata.cu_seqlens_q, attn_metadata.cu_seqlens_k, - attn_metadata.max_seqlen_q, attn_metadata.max_seqlen_k, - softmax_scale=scale, block_table=None - ) - else: - raise ValueError(f"Unsupported decode mode: {attn_metadata.decode_mode}") + return paged_attn_decode_unified_triton( + q, + k, + v, + k_cache, + v_cache, + attn_metadata, + softmax_scale=scale, + fp8_cache=False, + ) -def dllm_flash_attn_prefill( +def _decode_static_unified_triton_fp8_cache( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: - """ - Prefill attention wrapper that dynamically selects kernel based on quantization strategy. 
- - Args: - q: Query tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - k: Key tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - v: Value tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - scale: Attention scale factor - attn_metadata: Attention metadata - - Returns: - Output tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - """ + if attn_metadata.k_scale is None or attn_metadata.v_scale is None: + raise ValueError("FP8 KV decode requires k_scale and v_scale in metadata") + + # KV cache is stored as uint8 for FP8, but Triton expects float8 view dtype. from diffulex.utils.quantization.context import get_kv_cache_strategy - kv_strategy = get_kv_cache_strategy() - kv_fmt = getattr(kv_strategy, "kv_cache_format", "bf16") if kv_strategy is not None else "bf16" - # Q always uses BF16 (attn_q quantization is not supported) - q_fmt = "bf16" + strategy = get_kv_cache_strategy() + if strategy is None or getattr(strategy, "kv_cache_format", "bf16") != "fp8": + raise ValueError(f"Expected kv_cache_format='fp8', got strategy={type(strategy)}") - # Prefill currently uses BF16 kernels for all formats (FP8 prefill kernel TBD). - if q_fmt == "bf16" and kv_fmt in ("bf16", "fp8"): - return _dllm_flash_attn_prefill_bf16(q, k, v, scale, attn_metadata) - raise ValueError( - f"Unsupported q_format={q_fmt!r} / kv_cache_format={kv_fmt!r} for prefill" + k_cache_fp8 = strategy.view_kv_cache_for_kernels(k_cache) + v_cache_fp8 = strategy.view_kv_cache_for_kernels(v_cache) + + return paged_attn_decode_unified_triton( + q, + k, + v, + k_cache_fp8, + v_cache_fp8, + attn_metadata, + softmax_scale=scale, + fp8_cache=True, ) @@ -892,40 +157,37 @@ def dllm_flash_attn_decode( k_cache: torch.Tensor, v_cache: torch.Tensor, scale: float, - attn_metadata: AttnMetaDataBase + attn_metadata: AttnMetaDataBase, ) -> torch.Tensor: """ - Decode attention wrapper that dynamically selects kernel based on quantization strategy. - - Args: - q: Query tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - k: Key tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - v: Value tensor [KV_LEN, NUM_KV_HEADS, HEAD_DIM] - k_cache: Key cache tensor (shape depends on layout) - v_cache: Value cache tensor (shape depends on layout) - scale: Attention scale factor - attn_metadata: Attention metadata - - Returns: - Output tensor [Q_LEN, NUM_HEADS, HEAD_DIM] - - Note: - For FP8 strategy: - - Unified layout static mode: dequantization + scale fusion are handled inside the TileLang FP8 decode kernel - - Unified layout varlen mode: dequantization is handled by load_kvcache (Python path) - - Distinct layout: dequantization is handled by load_kvcache (Python path) + Decode attention wrapper: + - static: Triton paged-attention over (paged) KV cache + current-step KV + - varlen: load_kvcache (Triton gather/dequant) + flash-attn varlen """ from diffulex.utils.quantization.context import get_kv_cache_strategy + kv_strategy = get_kv_cache_strategy() kv_fmt = getattr(kv_strategy, "kv_cache_format", "bf16") if kv_strategy is not None else "bf16" - # Q always uses BF16 (attn_q quantization is not supported) - q_fmt = "bf16" + decode_mode = getattr(attn_metadata, "decode_mode", "varlen") + if decode_mode == "static": + # Only unified layout is supported in static paged-attention for now. 
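+        # Distinct-layout caches fall back to the varlen path, which gathers/dequantizes
+        # the KV cache via load_kvcache before calling flash-attn varlen.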
+ if getattr(attn_metadata, "kv_cache_layout", "unified") != "unified": + return _decode_varlen(q, k, v, k_cache, v_cache, scale, attn_metadata) + + if kv_fmt == "bf16": + return _decode_static_unified_triton_bf16(q, k, v, k_cache, v_cache, scale, attn_metadata) + if kv_fmt == "fp8": + return _decode_static_unified_triton_fp8_cache(q, k, v, k_cache, v_cache, scale, attn_metadata) + raise ValueError(f"Unsupported kv_cache_format={kv_fmt!r} for static decode") + + if decode_mode == "varlen": + return _decode_varlen(q, k, v, k_cache, v_cache, scale, attn_metadata) + + raise ValueError(f"Unsupported decode mode: {decode_mode!r}") + - if q_fmt == "bf16" and kv_fmt == "bf16": - return _dllm_flash_attn_decode_bf16(q, k, v, k_cache, v_cache, scale, attn_metadata) - if q_fmt == "bf16" and kv_fmt == "fp8": - return _dllm_flash_attn_decode_bf16_q_fp8_kv(q, k, v, k_cache, v_cache, scale, attn_metadata) - raise ValueError( - f"Unsupported q_format={q_fmt!r} / kv_cache_format={kv_fmt!r} for decode" - ) \ No newline at end of file +__all__ = [ + "dllm_flash_attn_prefill", + "dllm_flash_attn_decode", +] diff --git a/diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py b/diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py new file mode 100644 index 0000000..17dfaf9 --- /dev/null +++ b/diffulex_kernel/python/dllm_flash_attn_prefill_tilelang.py @@ -0,0 +1,250 @@ +import torch +import tilelang +import tilelang.language as T + +from flash_attn import flash_attn_varlen_func +from tilelang.autotuner import set_autotune_inputs + +from diffulex_kernel.python.auto_tuner import build_configs +from diffulex.attention.metadata import AttnMetaDataBase, is_warming_up + + +kernel_config = None + + +@tilelang.autotune(configs=build_configs()) +@tilelang.jit( + # NOTE: Disable TMA and warp specialized for now to avoid compile error on Hopper + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def dllm_flash_attn_prefill_kernel( + NUM_SEQS: int, + NUM_GROUPS: int, + Q_LEN: int, + KV_LEN: int, + NUM_HEADS: int, + HEAD_DIM: int, + IS_BLOCK_ATTN: bool, + DIFFUSION_BLOCK_SIZE: int, + BLOCK_M: int = 64, + BLOCK_N: int = 64, + NUM_STAGES: int = 1, + NUM_THREADS: int = 128, +): + SCALE = (1.0 / HEAD_DIM) ** 0.5 * 1.44269504 # log2(e) + NUM_KV_HEADS = NUM_HEADS // NUM_GROUPS + Q_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] + KV_SHAPE = [KV_LEN, NUM_KV_HEADS, HEAD_DIM] + O_SHAPE = [Q_LEN, NUM_HEADS, HEAD_DIM] + DTYPE = "bfloat16" + ACCUM_DTYPE = "float" + + @T.prim_func + def kernel( + Q: T.Tensor(Q_SHAPE, DTYPE), + K: T.Tensor(KV_SHAPE, DTYPE), + V: T.Tensor(KV_SHAPE, DTYPE), + cu_seqlens_q: T.Tensor(NUM_SEQS + 1, "int32"), + cu_seqlens_k: T.Tensor(NUM_SEQS + 1, "int32"), + max_seqlen_q: T.int32, + O: T.Tensor(O_SHAPE, DTYPE), + ): + with T.Kernel(T.ceildiv(max_seqlen_q, BLOCK_M), NUM_HEADS, NUM_SEQS, threads=NUM_THREADS) as (bx, by, bz): + Q_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) + K_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) + V_shared = T.alloc_shared([BLOCK_N, HEAD_DIM], DTYPE) + O_shared = T.alloc_shared([BLOCK_M, HEAD_DIM], DTYPE) + + acc_score = T.alloc_fragment([BLOCK_M, BLOCK_N], ACCUM_DTYPE) + acc_score_cast = T.alloc_fragment([BLOCK_M, BLOCK_N], DTYPE) + acc_output = T.alloc_fragment([BLOCK_M, HEAD_DIM], ACCUM_DTYPE) + scores_max = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + scores_max_prev = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + 
scores_scale = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + scores_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + log_sum = T.alloc_fragment([BLOCK_M], ACCUM_DTYPE) + + T.annotate_layout( + { + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + } + ) + + q_block_idx = bx + seq_idx = bz + head_idx = by + kv_head_idx = head_idx // NUM_GROUPS + + q_start_idx = cu_seqlens_q[seq_idx] + kv_start_idx = cu_seqlens_k[seq_idx] + q_end_idx = cu_seqlens_q[seq_idx + 1] + kv_end_idx = cu_seqlens_k[seq_idx + 1] + + cur_q_seqlen = q_end_idx - q_start_idx + cur_kv_seqlen = kv_end_idx - kv_start_idx + + T.copy( + Q[q_start_idx + q_block_idx * BLOCK_M : q_start_idx + (q_block_idx + 1) * BLOCK_M, head_idx, :], + Q_shared, + ) + + T.fill(acc_output, 0) + T.fill(acc_score, 0) + T.fill(log_sum, 0) + T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) + + loop_range = ( + T.min( + T.ceildiv(cur_q_seqlen + (q_block_idx + 1) * BLOCK_M, BLOCK_N), + T.ceildiv(cur_kv_seqlen, BLOCK_N), + ) + if IS_BLOCK_ATTN + else T.ceildiv(cur_kv_seqlen, BLOCK_N) + ) + for kv_block_idx in T.Pipelined(loop_range, num_stages=NUM_STAGES): + T.copy( + K[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], + K_shared, + ) + + if IS_BLOCK_ATTN: + for i, j in T.Parallel(BLOCK_M, BLOCK_N): + num_diffusion_blocks = (q_block_idx * BLOCK_M + i) // DIFFUSION_BLOCK_SIZE + 1 + acc_score[i, j] = T.if_then_else( + (num_diffusion_blocks * DIFFUSION_BLOCK_SIZE <= kv_block_idx * BLOCK_N + j) + or (q_block_idx * BLOCK_M + i >= cur_q_seqlen or kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), + -1e9, + 0, + ) + else: + for i, j in T.Parallel(BLOCK_M, BLOCK_N): + acc_score[i, j] = T.if_then_else( + (q_block_idx * BLOCK_M + i >= cur_q_seqlen or kv_block_idx * BLOCK_N + j >= cur_kv_seqlen), + -1e9, + 0, + ) + + T.gemm(Q_shared, K_shared, acc_score, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(ACCUM_DTYPE)) + T.reduce_max(acc_score, scores_max, dim=1, clear=False) + for i in T.Parallel(BLOCK_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + + for i in T.parallel(BLOCK_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * SCALE - scores_max[i] * SCALE) + + for i, j in T.Parallel(BLOCK_M, BLOCK_N): + acc_score[i, j] = T.exp2(acc_score[i, j] * SCALE - scores_max[i] * SCALE) + + T.reduce_sum(acc_score, scores_sum, dim=1) + for i in T.Parallel(BLOCK_M): + log_sum[i] = log_sum[i] * scores_scale[i] + scores_sum[i] + + T.copy(acc_score, acc_score_cast) + for i, j in T.Parallel(BLOCK_M, HEAD_DIM): + acc_output[i, j] *= scores_scale[i] + + T.copy( + V[kv_start_idx + kv_block_idx * BLOCK_N : kv_start_idx + (kv_block_idx + 1) * BLOCK_N, kv_head_idx, :], + V_shared, + ) + T.gemm(acc_score_cast, V_shared, acc_output, policy=T.GemmWarpPolicy.FullRow) + + for i, j in T.Parallel(BLOCK_M, HEAD_DIM): + acc_output[i, j] /= log_sum[i] + + T.copy(acc_output, O_shared) + for i, d_idx in T.Parallel(BLOCK_M, HEAD_DIM): + if i + q_block_idx * BLOCK_M < cur_q_seqlen: + O[i + q_start_idx + q_block_idx * BLOCK_M, head_idx, d_idx] = O_shared[i, d_idx] + + return kernel + + +def dllm_flash_attn_prefill_tilelang( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: float, + attn_metadata: AttnMetaDataBase, +) -> torch.Tensor: + """ + TileLang-based prefill implementation (existing behavior). + Kept in a separate module so importing decode kernels doesn't require TileLang. 
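+
+    Shapes follow the kernel above: q is [Q_LEN, NUM_HEADS, HEAD_DIM] and k/v are
+    [KV_LEN, NUM_KV_HEADS, HEAD_DIM] (bf16), packed per-sequence via cu_seqlens_q/cu_seqlens_k.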
+ """ + global kernel_config + if attn_metadata.attn_type == "full_attention": + return flash_attn_varlen_func( + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + attn_metadata.max_seqlen_k, + softmax_scale=scale, + block_table=None, + ) + if attn_metadata.attn_type != "block_attention": + raise ValueError(f"Unsupported attn_type={attn_metadata.attn_type!r} for prefill") + + if is_warming_up(): + with set_autotune_inputs( + [ + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + ] + ): + prefill_kernel = dllm_flash_attn_prefill_kernel( + attn_metadata.num_seqs, + q.shape[1] // k.shape[1], + q.shape[0], + k.shape[0], + q.shape[1], + q.shape[2], + attn_metadata.attn_type == "block_attention", + attn_metadata.diffusion_block_size, + ) + kernel_config = prefill_kernel.config + return prefill_kernel( + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + ) + + config_kwargs = kernel_config if kernel_config is not None else {} + prefill_kernel = dllm_flash_attn_prefill_kernel( + attn_metadata.num_seqs, + q.shape[1] // k.shape[1], + q.shape[0], + k.shape[0], + q.shape[1], + q.shape[2], + attn_metadata.attn_type == "block_attention", + attn_metadata.diffusion_block_size, + **config_kwargs, + ) + return prefill_kernel( + q, + k, + v, + attn_metadata.cu_seqlens_q, + attn_metadata.cu_seqlens_k, + attn_metadata.max_seqlen_q, + ) + diff --git a/diffulex_kernel/python/paged_attn_decode_triton.py b/diffulex_kernel/python/paged_attn_decode_triton.py new file mode 100644 index 0000000..1fabf19 --- /dev/null +++ b/diffulex_kernel/python/paged_attn_decode_triton.py @@ -0,0 +1,661 @@ +import torch +import triton +import triton.language as tl + +import os + +from diffulex.attention.metadata import AttnMetaDataBase + + +@triton.jit +def _paged_decode_attn_unified_bf16_cache_kernel( + q_ptr, + k_ptr, + v_ptr, + k_cache_ptr, + v_cache_ptr, + block_tables_ptr, + context_lens_ptr, + cu_seqlens_q_ptr, + o_ptr, + softmax_scale, # fp32 scalar + # q/k/v/o strides + q_stride_s, + q_stride_h, + q_stride_d, + kv_stride_s, + kv_stride_h, + kv_stride_d, + o_stride_s, + o_stride_h, + o_stride_d, + # cache strides: [nblks, page, kvh, d] + k_cache_stride_nblks, + k_cache_stride_page, + k_cache_stride_h, + k_cache_stride_d, + v_cache_stride_nblks, + v_cache_stride_page, + v_cache_stride_h, + v_cache_stride_d, + # block_tables strides + block_tables_stride_s, + block_tables_stride_b, + # misc + NUM_GROUPS: tl.constexpr, + HEAD_DIM: tl.constexpr, + HEAD_DIM_PADDED: tl.constexpr, + PAGE_SIZE: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + pid_seq = tl.program_id(0) + pid_head = tl.program_id(1) + pid_m = tl.program_id(2) + + kv_head = pid_head // NUM_GROUPS + + q_start = tl.load(cu_seqlens_q_ptr + pid_seq).to(tl.int32) + q_end = tl.load(cu_seqlens_q_ptr + pid_seq + 1).to(tl.int32) + q_len = q_end - q_start + new_len = q_len # decode path: current-step KV length matches query length + context_len = tl.load(context_lens_ptr + pid_seq).to(tl.int32) + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM_PADDED) + mask_m = offs_m < q_len + mask_d = offs_d < HEAD_DIM + + q_offs = (q_start + offs_m[:, None]) * q_stride_s + pid_head * q_stride_h + offs_d[None, :] * q_stride_d + q = tl.load(q_ptr + q_offs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.bfloat16) + + m = tl.full([BLOCK_M], float("-inf"), 
dtype=tl.float32) + l = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM_PADDED], dtype=tl.float32) + + # Cache stage: iterate only needed blocks (dynamic loop, like vLLM kernels). + offs_n_cache = tl.arange(0, BLOCK_N) + tok_off_cache = offs_n_cache + mask_n_cache = offs_n_cache < PAGE_SIZE + + num_cache_blocks = (context_len + PAGE_SIZE - 1) // PAGE_SIZE + for blk in range(0, num_cache_blocks): + page = tl.load(block_tables_ptr + pid_seq * block_tables_stride_s + blk * block_tables_stride_b).to(tl.int32) + tok_base = blk * PAGE_SIZE + tok_idx = tok_base + tok_off_cache + valid_tok = (page >= 0) & (tok_idx < context_len) & mask_n_cache + + k_offs = ( + page * k_cache_stride_nblks + + tok_off_cache[:, None] * k_cache_stride_page + + kv_head * k_cache_stride_h + + offs_d[None, :] * k_cache_stride_d + ) + k_blk = tl.load( + k_cache_ptr + k_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, tl.trans(k_blk)).to(tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = ( + page * v_cache_stride_nblks + + tok_off_cache[:, None] * v_cache_stride_page + + kv_head * v_cache_stride_h + + offs_d[None, :] * v_cache_stride_d + ) + v_blk = tl.load( + v_cache_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + # New KV stage (dynamic tiles) + kv_start = q_start + for start_n in range(0, new_len, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + valid_tok = offs_n < new_len + + k_offs = (kv_start + offs_n[None, :]) * kv_stride_s + kv_head * kv_stride_h + offs_d[:, None] * kv_stride_d + k_blk = tl.load( + k_ptr + k_offs, + mask=valid_tok[None, :] & mask_d[:, None], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, k_blk).to(tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = (kv_start + offs_n[:, None]) * kv_stride_s + kv_head * kv_stride_h + offs_d[None, :] * kv_stride_d + v_blk = tl.load( + v_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + out = acc / l[:, None] + o_offs = (q_start + offs_m[:, None]) * o_stride_s + pid_head * o_stride_h + offs_d[None, :] * o_stride_d + tl.store(o_ptr + o_offs, out.to(tl.bfloat16), mask=mask_m[:, None] & mask_d[None, :]) + + +@triton.jit +def _paged_decode_attn_unified_fp8_cache_kernel_legacy( + q_ptr, + k_ptr, + v_ptr, + k_cache_ptr, + v_cache_ptr, + k_scale_ptr, + v_scale_ptr, + block_tables_ptr, + context_lens_ptr, + cu_seqlens_q_ptr, + o_ptr, + softmax_scale, # fp32 scalar + # q/k/v/o strides + q_stride_s, + q_stride_h, + q_stride_d, + kv_stride_s, + kv_stride_h, + kv_stride_d, + o_stride_s, + o_stride_h, + o_stride_d, + # cache strides: [nblks, page, kvh, d] + k_cache_stride_nblks, + k_cache_stride_page, + k_cache_stride_h, + k_cache_stride_d, + v_cache_stride_nblks, + v_cache_stride_page, + v_cache_stride_h, + 
v_cache_stride_d, + # block_tables strides + block_tables_stride_s, + block_tables_stride_b, + # misc + NUM_GROUPS: tl.constexpr, + HEAD_DIM: tl.constexpr, + HEAD_DIM_PADDED: tl.constexpr, + PAGE_SIZE: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + pid_seq = tl.program_id(0) + pid_head = tl.program_id(1) + pid_m = tl.program_id(2) + + kv_head = pid_head // NUM_GROUPS + k_scale = tl.load(k_scale_ptr + kv_head).to(tl.float32) + v_scale = tl.load(v_scale_ptr + kv_head).to(tl.float32) + + q_start = tl.load(cu_seqlens_q_ptr + pid_seq).to(tl.int32) + q_end = tl.load(cu_seqlens_q_ptr + pid_seq + 1).to(tl.int32) + q_len = q_end - q_start + new_len = q_len + context_len = tl.load(context_lens_ptr + pid_seq).to(tl.int32) + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM_PADDED) + mask_m = offs_m < q_len + mask_d = offs_d < HEAD_DIM + + q_offs = (q_start + offs_m[:, None]) * q_stride_s + pid_head * q_stride_h + offs_d[None, :] * q_stride_d + q = tl.load(q_ptr + q_offs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.bfloat16) + + m = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM_PADDED], dtype=tl.float32) + + offs_n_cache = tl.arange(0, BLOCK_N) + tok_off_cache = offs_n_cache + mask_n_cache = offs_n_cache < PAGE_SIZE + + num_cache_blocks = (context_len + PAGE_SIZE - 1) // PAGE_SIZE + for blk in range(0, num_cache_blocks): + page = tl.load(block_tables_ptr + pid_seq * block_tables_stride_s + blk * block_tables_stride_b).to(tl.int32) + tok_base = blk * PAGE_SIZE + tok_idx = tok_base + tok_off_cache + valid_tok = (page >= 0) & (tok_idx < context_len) & mask_n_cache + + k_offs = ( + page * k_cache_stride_nblks + + tok_off_cache[:, None] * k_cache_stride_page + + kv_head * k_cache_stride_h + + offs_d[None, :] * k_cache_stride_d + ) + # fp8 cache values: dot(Q, K_fp8) * k_scale == dot(Q, (K_fp8*k_scale)) + k_blk = tl.load( + k_cache_ptr + k_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, tl.trans(k_blk)).to(tl.float32) * (softmax_scale * k_scale) + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = ( + page * v_cache_stride_nblks + + tok_off_cache[:, None] * v_cache_stride_page + + kv_head * v_cache_stride_h + + offs_d[None, :] * v_cache_stride_d + ) + v_blk = tl.load( + v_cache_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + # Apply v_scale on weights (cheaper than scaling V elementwise). 
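+        # Identity used here: P @ (V_fp8 * v_scale) == (P * v_scale) @ V_fp8 for a scalar
+        # per-head v_scale, so only the [BLOCK_M, BLOCK_N] probability tile gets scaled.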
+ acc += tl.dot((p * v_scale).to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + kv_start = q_start + for start_n in range(0, new_len, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + valid_tok = offs_n < new_len + + k_offs = (kv_start + offs_n[None, :]) * kv_stride_s + kv_head * kv_stride_h + offs_d[:, None] * kv_stride_d + k_blk = tl.load( + k_ptr + k_offs, + mask=valid_tok[None, :] & mask_d[:, None], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, k_blk).to(tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = (kv_start + offs_n[:, None]) * kv_stride_s + kv_head * kv_stride_h + offs_d[None, :] * kv_stride_d + v_blk = tl.load( + v_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk).to(tl.float32) + m = m_new + l = l_new + + out = acc / l[:, None] + o_offs = (q_start + offs_m[:, None]) * o_stride_s + pid_head * o_stride_h + offs_d[None, :] * o_stride_d + tl.store(o_ptr + o_offs, out.to(tl.bfloat16), mask=mask_m[:, None] & mask_d[None, :]) + + +@triton.jit +def _paged_decode_attn_unified_fp8_cache_fused_dot_kernel( + q_ptr, + k_ptr, + v_ptr, + k_cache_ptr, + v_cache_ptr, + k_scale_ptr, + v_scale_ptr, + block_tables_ptr, + context_lens_ptr, + cu_seqlens_q_ptr, + o_ptr, + softmax_scale, # fp32 scalar + # q/k/v/o strides + q_stride_s, + q_stride_h, + q_stride_d, + kv_stride_s, + kv_stride_h, + kv_stride_d, + o_stride_s, + o_stride_h, + o_stride_d, + # cache strides: [nblks, page, kvh, d] + k_cache_stride_nblks, + k_cache_stride_page, + k_cache_stride_h, + k_cache_stride_d, + v_cache_stride_nblks, + v_cache_stride_page, + v_cache_stride_h, + v_cache_stride_d, + # block_tables strides + block_tables_stride_s, + block_tables_stride_b, + # misc + KV_FORMAT: tl.constexpr, + NUM_GROUPS: tl.constexpr, + HEAD_DIM: tl.constexpr, + HEAD_DIM_PADDED: tl.constexpr, + PAGE_SIZE: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + """ + FP8-cache decode kernel with *fused* fp8 math: + - Keep KV cache tiles in float8 (via fp8 view tensor) + - Use tl.dot_scaled(..., rhs_format="e4m3/e5m2") to consume fp8 without explicit dequant tensors + - Apply per-head scalar scales (k_scale/v_scale) without elementwise dequantization + """ + pid_seq = tl.program_id(0) + pid_head = tl.program_id(1) + pid_m = tl.program_id(2) + + kv_head = pid_head // NUM_GROUPS + k_scale = tl.load(k_scale_ptr + kv_head).to(tl.float32) + v_scale = tl.load(v_scale_ptr + kv_head).to(tl.float32) + + q_start = tl.load(cu_seqlens_q_ptr + pid_seq).to(tl.int32) + q_end = tl.load(cu_seqlens_q_ptr + pid_seq + 1).to(tl.int32) + q_len = q_end - q_start + new_len = q_len + context_len = tl.load(context_lens_ptr + pid_seq).to(tl.int32) + + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_DIM_PADDED) + mask_m = offs_m < q_len + mask_d = offs_d < HEAD_DIM + + # Load Q (bf16). Note: triton 3.5 `tl.dot` does not support mixed bf16/fp16 x fp8. + # We use `tl.dot_scaled` (microscaling) to accept fp8 operands. 
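+    # The scale operands of `tl.dot_scaled` are passed as None below (no per-block exponents
+    # are supplied); the fp8 bytes are interpreted via KV_FORMAT, and the per-head scalar
+    # k_scale/v_scale are applied after each dot instead.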
+ q_offs = (q_start + offs_m[:, None]) * q_stride_s + pid_head * q_stride_h + offs_d[None, :] * q_stride_d + q = tl.load(q_ptr + q_offs, mask=mask_m[:, None] & mask_d[None, :], other=0.0).to(tl.bfloat16) + + m = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_DIM_PADDED], dtype=tl.float32) + + offs_n_cache = tl.arange(0, BLOCK_N) + tok_off_cache = offs_n_cache + mask_n_cache = offs_n_cache < PAGE_SIZE + + num_cache_blocks = (context_len + PAGE_SIZE - 1) // PAGE_SIZE + for blk in range(0, num_cache_blocks): + page = tl.load(block_tables_ptr + pid_seq * block_tables_stride_s + blk * block_tables_stride_b).to(tl.int32) + tok_base = blk * PAGE_SIZE + tok_idx = tok_base + tok_off_cache + valid_tok = (page >= 0) & (tok_idx < context_len) & mask_n_cache + + # K cache: keep fp8 element type; load as [K, N] to match dot_scaled rhs layout. + k_offs = ( + page * k_cache_stride_nblks + + tok_off_cache[None, :] * k_cache_stride_page + + kv_head * k_cache_stride_h + + offs_d[:, None] * k_cache_stride_d + ) + k_blk = tl.load( + k_cache_ptr + k_offs, + mask=mask_d[:, None] & valid_tok[None, :], + other=0.0, + ) + + # scores = QK^T * softmax_scale, with scalar k_scale applied after dot: + # dot(Q, K_true) == dot(Q, K_fp8) * k_scale (per-head scalar scale). + scores = tl.dot_scaled( + q, + None, + "bf16", + k_blk, + None, + KV_FORMAT, + ) * (softmax_scale * k_scale) + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + # V cache: keep fp8 element type for tl.dot. + v_offs = ( + page * v_cache_stride_nblks + + tok_off_cache[:, None] * v_cache_stride_page + + kv_head * v_cache_stride_h + + offs_d[None, :] * v_cache_stride_d + ) + v_blk = tl.load( + v_cache_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ) + + # acc += P @ V_true == (P @ V_fp8) * v_scale + acc += tl.dot_scaled( + p.to(tl.float16), + None, + "fp16", + v_blk, + None, + KV_FORMAT, + ) * v_scale + m = m_new + l = l_new + + # New KV stage (bf16 tensors, unchanged) + kv_start = q_start + for start_n in range(0, new_len, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + valid_tok = offs_n < new_len + + k_offs = (kv_start + offs_n[None, :]) * kv_stride_s + kv_head * kv_stride_h + offs_d[:, None] * kv_stride_d + k_blk = tl.load( + k_ptr + k_offs, + mask=valid_tok[None, :] & mask_d[:, None], + other=0.0, + ).to(tl.bfloat16) + + scores = tl.dot(q, k_blk, out_dtype=tl.float32) * softmax_scale + scores = tl.where(mask_m[:, None] & valid_tok[None, :], scores, float("-inf")) + + m_new = tl.maximum(m, tl.max(scores, axis=1)) + p = tl.exp(scores - m_new[:, None]) + l_new = l * tl.exp(m - m_new) + tl.sum(p, axis=1) + alpha = tl.exp(m - m_new) + acc *= alpha[:, None] + + v_offs = (kv_start + offs_n[:, None]) * kv_stride_s + kv_head * kv_stride_h + offs_d[None, :] * kv_stride_d + v_blk = tl.load( + v_ptr + v_offs, + mask=valid_tok[:, None] & mask_d[None, :], + other=0.0, + ).to(tl.bfloat16) + + acc += tl.dot(p.to(tl.bfloat16), v_blk, out_dtype=tl.float32) + m = m_new + l = l_new + + out = acc / l[:, None] + o_offs = (q_start + offs_m[:, None]) * o_stride_s + pid_head * o_stride_h + offs_d[None, :] * o_stride_d + tl.store(o_ptr + o_offs, out.to(tl.bfloat16), mask=mask_m[:, None] & mask_d[None, :]) + + +def 
paged_attn_decode_unified_triton( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + attn_metadata: AttnMetaDataBase, + *, + softmax_scale: float, + fp8_cache: bool, +) -> torch.Tensor: + """ + Triton paged-attention decode for unified KV cache layout. + + q: [total_q, num_heads, head_dim] (bf16) + k/v: [total_q, num_kv_heads, head_dim] (bf16), aligned with cu_seqlens_q + k_cache/v_cache: + - bf16: [num_page_blocks, page_size, num_kv_heads, head_dim] + - fp8 : same shape but dtype must be float8 view for triton (strategy.view_kv_cache_for_kernels) + """ + assert q.is_cuda and k.is_cuda and v.is_cuda and k_cache.is_cuda and v_cache.is_cuda + assert q.dtype == torch.bfloat16 and k.dtype == torch.bfloat16 and v.dtype == torch.bfloat16 + assert attn_metadata.block_tables is not None and attn_metadata.context_lens is not None and attn_metadata.cu_seqlens_q is not None + assert attn_metadata.kv_cache_layout == "unified", f"only unified layout supported, got {attn_metadata.kv_cache_layout}" + + # Be robust to different metadata implementations (dataclass vs SimpleNamespace in tests). + num_seqs = int(attn_metadata.cu_seqlens_q.numel() - 1) + num_heads = q.shape[1] + head_dim = q.shape[2] + num_kv_heads = k.shape[1] + assert num_heads % num_kv_heads == 0 + num_groups = num_heads // num_kv_heads + + page_size = int(attn_metadata.page_block_size) + + # Heuristics: BLOCK_M = 64 (supports diffusion_block_size=32/64), BLOCK_N = page_size/new-tile + BLOCK_M = 64 + BLOCK_N = 32 if page_size <= 32 else 64 + # Cache stage requires BLOCK_N == PAGE_SIZE to simplify; enforce. + if BLOCK_N != page_size: + BLOCK_N = page_size + + head_dim_padded = 1 << (head_dim - 1).bit_length() + + o = torch.empty_like(q) + grid = (num_seqs, num_heads, triton.cdiv(int(attn_metadata.max_seqlen_q), BLOCK_M)) + + if fp8_cache: + if attn_metadata.k_scale is None or attn_metadata.v_scale is None: + raise ValueError("fp8_cache=True requires attn_metadata.k_scale/v_scale") + # Default to fused fp8-dot kernel; fallback to legacy on compile/runtime failures. + # Set DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT=0 to force legacy. + # Set DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT_STRICT=1 to raise instead of fallback. + use_fused_dot = os.getenv("DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT", "1") != "0" + strict_fused = os.getenv("DIFFULEX_PAGED_DECODE_FP8_FUSED_DOT_STRICT", "0") == "1" + if use_fused_dot: + # `tl.dot_scaled` needs the fp8 format string to interpret raw bytes correctly. + # Derive from the fp8 view dtype (torch.float8_*). 
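+            # e.g. torch.float8_e4m3fn / float8_e4m3fnuz -> "e4m3", torch.float8_e5m2 -> "e5m2".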
+ dt = str(k_cache.dtype) + if "e4m3" in dt: + kv_format = "e4m3" + elif "e5m2" in dt: + kv_format = "e5m2" + else: + raise ValueError(f"Unsupported fp8 k_cache dtype for fused-dot: {k_cache.dtype}") + try: + _paged_decode_attn_unified_fp8_cache_fused_dot_kernel[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.k_scale, attn_metadata.v_scale, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + KV_FORMAT=kv_format, + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + except Exception: + if strict_fused: + raise + _paged_decode_attn_unified_fp8_cache_kernel_legacy[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.k_scale, attn_metadata.v_scale, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + else: + _paged_decode_attn_unified_fp8_cache_kernel_legacy[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.k_scale, attn_metadata.v_scale, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + else: + _paged_decode_attn_unified_bf16_cache_kernel[grid]( + q, k, v, + k_cache, v_cache, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.cu_seqlens_q, + o, + softmax_scale, + *q.stride(), *k.stride(), *o.stride(), + *k_cache.stride(), *v_cache.stride(), + *attn_metadata.block_tables.stride(), + NUM_GROUPS=num_groups, + HEAD_DIM=head_dim, + HEAD_DIM_PADDED=head_dim_padded, + PAGE_SIZE=page_size, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_warps=4, + num_stages=2, + ) + + return o + diff --git a/profile/analyze_trace_bottlenecks.py b/profile/analyze_trace_bottlenecks.py new file mode 100644 index 0000000..41821d3 --- /dev/null +++ b/profile/analyze_trace_bottlenecks.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +""" +Analyze huge torch chrome trace (streaming) to locate non-GEMM bottlenecks. + +Outputs: +- duration of user_annotation "diffulex.generate(profiled)" (wall-ish) +- GPU active time (union of kernel/memcpy/memset intervals) to estimate GPU idle gaps +- top CUDA runtime/driver API calls by CPU time + +Designed to work without loading the >2GB JSON into memory. 
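+
+Example:
+    python profile/analyze_trace_bottlenecks.py --trace <trace.json> --out <report.json>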
+""" + +from __future__ import annotations + +import argparse +import json +from collections import Counter, defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple + + +def _extract_str_after_key(line: str, key: str) -> Optional[str]: + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + q1 = line.find('"', colon) + if q1 < 0: + return None + q2 = line.find('"', q1 + 1) + if q2 < 0: + return None + return line[q1 + 1 : q2] + + +def _extract_num_after_key(line: str, key: str) -> Optional[float]: + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + frag = line[colon + 1 :].strip() + comma = frag.find(",") + if comma >= 0: + frag = frag[:comma] + try: + return float(frag.strip()) + except Exception: + return None + + +def _extract_json_object_value(line: str, key: str) -> Optional[Any]: + """ + Extract JSON object/array value following `"key":` on the same line. + Assumes the value is a JSON object {...} or array [...] and is fully contained in the line. + """ + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + # find first '{' or '[' after colon + start = None + for i in range(colon, len(line)): + if line[i] == "{": + start = i + open_ch, close_ch = "{", "}" + break + if line[i] == "[": + start = i + open_ch, close_ch = "[", "]" + break + if start is None: + return None + depth = 0 + end = None + for i in range(start, len(line)): + ch = line[i] + if ch == open_ch: + depth += 1 + elif ch == close_ch: + depth -= 1 + if depth == 0: + end = i + 1 + break + if end is None: + return None + frag = line[start:end] + try: + return json.loads(frag) + except Exception: + return None + + +@dataclass +class Interval: + start: float + end: float + + +def _merge_intervals(intervals: List[Interval]) -> List[Interval]: + if not intervals: + return [] + intervals.sort(key=lambda x: x.start) + merged: List[Interval] = [intervals[0]] + for it in intervals[1:]: + last = merged[-1] + if it.start <= last.end: + if it.end > last.end: + last.end = it.end + else: + merged.append(it) + return merged + + +def analyze(trace_path: Path) -> Dict[str, Any]: + # union intervals for GPU activity across all streams + gpu_intervals: List[Interval] = [] + gpu_min_ts: Optional[float] = None + gpu_max_end: Optional[float] = None + + # also per stream, to detect if one stream is idle most of the time + gpu_intervals_by_stream: Dict[int, List[Interval]] = defaultdict(list) + + # user annotation + generate_dur_us: Optional[float] = None + + # runtime/driver api durations (cpu-side) + cuda_runtime: Counter[str] = Counter() + cuda_driver: Counter[str] = Counter() + + in_events = False + in_obj = False + depth = 0 + buf: List[str] = [] + + def _consume_event(text: str) -> None: + nonlocal generate_dur_us, gpu_min_ts, gpu_max_end + # quick checks without json parsing + if '"cat"' not in text or '"name"' not in text: + return + cat = None + name = None + # extract cat/name + # cat and name appear on first line typically, but safe on full text. 
+ for line in text.splitlines(): + if cat is None and '"cat"' in line: + v = _extract_str_after_key(line, "cat") + if v: + cat = v + if name is None and '"name"' in line: + v = _extract_str_after_key(line, "name") + if v: + name = v + if cat is not None and name is not None: + break + if cat is None or name is None: + return + + if cat == "user_annotation" and name == "diffulex.generate(profiled)": + # duration in us + for line in text.splitlines(): + if '"dur"' in line: + d = _extract_num_after_key(line, "dur") + if d is not None: + generate_dur_us = d + break + return + + # cuda runtime/driver (CPU) + if cat == "cuda_runtime": + d = None + for line in text.splitlines(): + if '"dur"' in line: + d = _extract_num_after_key(line, "dur") + break + if d is not None: + cuda_runtime[name] += d + return + if cat == "cuda_driver": + d = None + for line in text.splitlines(): + if '"dur"' in line: + d = _extract_num_after_key(line, "dur") + break + if d is not None: + cuda_driver[name] += d + return + + # GPU activity events + if cat in ("kernel", "gpu_memcpy", "gpu_memset"): + ts = None + dur = None + stream = None + for line in text.splitlines(): + if ts is None and '"ts"' in line: + ts = _extract_num_after_key(line, "ts") + if dur is None and '"dur"' in line: + dur = _extract_num_after_key(line, "dur") + if stream is None and '"args"' in line and "stream" in line: + # args is often multi-line; rely on json fragment extraction when seen + pass + # extract args object to fetch stream quickly (safe, small) + args_obj = None + for line in text.splitlines(): + if '"args"' in line: + args_obj = _extract_json_object_value(line, "args") + break + if isinstance(args_obj, dict): + try: + stream = int(args_obj.get("stream", -1)) + except Exception: + stream = None + if ts is None or dur is None: + return + start = ts + end = ts + dur + gpu_intervals.append(Interval(start, end)) + if stream is not None and stream >= 0: + gpu_intervals_by_stream[stream].append(Interval(start, end)) + gpu_min_ts = start if gpu_min_ts is None else min(gpu_min_ts, start) + gpu_max_end = end if gpu_max_end is None else max(gpu_max_end, end) + return + + with trace_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + if not in_events: + if '"traceEvents"' in line and "[" in line: + in_events = True + continue + if not in_obj: + if line.lstrip().startswith("{"): + in_obj = True + buf = [line] + depth = line.count("{") - line.count("}") + else: + if line.lstrip().startswith("]"): + break + continue + else: + buf.append(line) + depth += line.count("{") - line.count("}") + if in_obj and depth <= 0: + _consume_event("".join(buf)) + in_obj = False + + merged = _merge_intervals(gpu_intervals) + active_us = sum(it.end - it.start for it in merged) + span_us = (gpu_max_end - gpu_min_ts) if (gpu_min_ts is not None and gpu_max_end is not None) else 0.0 + + per_stream_active: Dict[int, float] = {} + for s, ints in gpu_intervals_by_stream.items(): + m = _merge_intervals(ints) + per_stream_active[s] = sum(it.end - it.start for it in m) + + top_runtime = cuda_runtime.most_common(30) + top_driver = cuda_driver.most_common(30) + + return { + "trace": str(trace_path), + "generate_dur_us": generate_dur_us, + "gpu_active_union_us": active_us, + "gpu_span_us": span_us, + "gpu_active_ratio_union_over_span": (active_us / span_us) if span_us > 0 else None, + "gpu_active_ratio_union_over_generate": (active_us / generate_dur_us) if (generate_dur_us and generate_dur_us > 0) else None, + "gpu_span_over_generate": (span_us / 
generate_dur_us) if (generate_dur_us and generate_dur_us > 0) else None, + "gpu_event_count": len(gpu_intervals), + "gpu_stream_count": len(per_stream_active), + "top_cuda_runtime_us": top_runtime, + "top_cuda_driver_us": top_driver, + "top_stream_active_us": sorted(per_stream_active.items(), key=lambda kv: kv[1], reverse=True)[:10], + } + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--trace", type=str, required=True) + ap.add_argument("--out", type=str, required=True) + args = ap.parse_args() + + res = analyze(Path(args.trace)) + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(res, indent=2), encoding="utf-8") + print(f"[OK] wrote: {out_path}") + + +if __name__ == "__main__": + main() + diff --git a/profile/analyze_trace_cpu_ops.py b/profile/analyze_trace_cpu_ops.py new file mode 100644 index 0000000..c08b05c --- /dev/null +++ b/profile/analyze_trace_cpu_ops.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +""" +Stream-aggregate CPU-side durations from huge torch chrome traces. + +We aggregate: +- cat=cpu_op +- cat=python_function +- cat=user_annotation + +This helps answer: where is the extra walltime coming from (outside CUDA kernels)? +""" + +from __future__ import annotations + +import argparse +import json +from collections import Counter, defaultdict +from pathlib import Path +from typing import Dict, Optional, Tuple + + +def _extract_str_after_key(s: str, key: str) -> Optional[str]: + k = f"\"{key}\"" + pos = s.find(k) + if pos < 0: + return None + colon = s.find(":", pos + len(k)) + if colon < 0: + return None + q1 = s.find('"', colon) + if q1 < 0: + return None + q2 = s.find('"', q1 + 1) + if q2 < 0: + return None + return s[q1 + 1 : q2] + + +def _extract_num_after_key(s: str, key: str) -> Optional[float]: + k = f"\"{key}\"" + pos = s.find(k) + if pos < 0: + return None + colon = s.find(":", pos + len(k)) + if colon < 0: + return None + frag = s[colon + 1 :].strip() + comma = frag.find(",") + if comma >= 0: + frag = frag[:comma] + try: + return float(frag.strip()) + except Exception: + return None + + +def analyze(trace_path: Path, cats: Tuple[str, ...]) -> Dict[str, Dict[str, Dict[str, float]]]: + # cat -> name -> (dur_us_sum, calls) + dur: Dict[str, Counter[str]] = {c: Counter() for c in cats} + calls: Dict[str, Counter[str]] = {c: Counter() for c in cats} + + in_events = False + in_obj = False + depth = 0 + buf = [] + + def consume(text: str) -> None: + if '"cat"' not in text or '"name"' not in text: + return + cat = None + name = None + d = None + for line in text.splitlines(): + if cat is None and '"cat"' in line: + cat = _extract_str_after_key(line, "cat") + if name is None and '"name"' in line: + name = _extract_str_after_key(line, "name") + if d is None and '"dur"' in line: + d = _extract_num_after_key(line, "dur") + if cat and name and d is not None: + break + if cat not in cats or name is None: + return + calls[cat][name] += 1 + if d is not None: + dur[cat][name] += d + + with trace_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + if not in_events: + if '"traceEvents"' in line and "[" in line: + in_events = True + continue + if not in_obj: + if line.lstrip().startswith("{"): + in_obj = True + buf = [line] + depth = line.count("{") - line.count("}") + else: + if line.lstrip().startswith("]"): + break + continue + else: + buf.append(line) + depth += line.count("{") - line.count("}") + if in_obj and depth <= 0: + consume("".join(buf)) + in_obj = False + + 
out: Dict[str, Dict[str, Dict[str, float]]] = {} + for c in cats: + out[c] = {} + for name, total in dur[c].items(): + out[c][name] = { + "dur_us": float(total), + "calls": float(calls[c][name]), + "avg_us": float(total) / float(calls[c][name]) if calls[c][name] else 0.0, + } + return out + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--trace", type=str, required=True) + ap.add_argument("--out", type=str, required=True) + ap.add_argument("--topk", type=int, default=50) + args = ap.parse_args() + + cats = ("cpu_op", "python_function", "user_annotation") + res = analyze(Path(args.trace), cats) + + # Write a compact report: per-cat topk by dur. + lines = [] + lines.append(f"Trace: {args.trace}") + lines.append("") + for c in cats: + items = sorted(res[c].items(), key=lambda kv: kv[1]["dur_us"], reverse=True)[: args.topk] + lines.append(f"== {c} top {args.topk} by dur_us ==") + for name, st in items: + lines.append(f"{st['dur_us']:.3f} us calls={int(st['calls'])} avg={st['avg_us']:.3f} us {name}") + lines.append("") + + Path(args.out).write_text("\n".join(lines), encoding="utf-8") + print(f"[OK] wrote: {args.out}") + + +if __name__ == "__main__": + main() + diff --git a/profile/analyze_trace_gemm_shapes.py b/profile/analyze_trace_gemm_shapes.py new file mode 100644 index 0000000..98a0a0d --- /dev/null +++ b/profile/analyze_trace_gemm_shapes.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +""" +Stream-parse PyTorch chrome trace JSON (very large) and aggregate GEMM shape +distributions for selected ops. + +This script is designed for traces exported with record_shapes=True, where op +events contain args["Input Dims"]. + +Example: + python profile/analyze_trace_gemm_shapes.py \ + --trace log/torch_profiles/20260125_023133/pytorch_trace_diffulex.generate(profiled).json \ + --out log/torch_profiles/20260125_023133/gemm_shapes_bf16.txt \ + --ops aten::mm aten::addmm +""" + +from __future__ import annotations + +import argparse +import json +import math +from collections import Counter, defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple + + +def _parse_json_value_fragment(fragment: str) -> Any: + # fragment: after ':' in a JSON line, possibly ending with ',' and newline. + frag = fragment.strip() + if frag.endswith(","): + frag = frag[:-1] + return json.loads(frag) + +def _extract_json_array_after_key(line: str, key: str) -> Optional[Any]: + """ + Extract and json-load the array value after `"key":` from a possibly + multi-field JSON line, e.g. + ..."Input Dims": [[1,2],[3,4]], "Ev Idx": 5 + """ + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + # Find the first '[' after the colon. + start = line.find("[", colon) + if start < 0: + return None + depth = 0 + end = -1 + for i in range(start, len(line)): + ch = line[i] + if ch == "[": + depth += 1 + elif ch == "]": + depth -= 1 + if depth == 0: + end = i + 1 + break + if end < 0: + return None + frag = line[start:end] + try: + return json.loads(frag) + except Exception: + return None + + +def _extract_quoted_value(line: str) -> Optional[str]: + # very small helper: extract first "...". + i = line.find('"') + if i < 0: + return None + j = line.find('"', i + 1) + if j < 0: + return None + return line[i + 1 : j] + + +def _extract_number_after_colon(line: str) -> Optional[float]: + # e.g. 
"dur": 123.0, + if ":" not in line: + return None + frag = line.split(":", 1)[1].strip() + if frag.endswith(","): + frag = frag[:-1] + try: + return float(frag) + except Exception: + return None + +def _extract_number_after_key(line: str, key: str) -> Optional[float]: + """ + Extract a numeric value after `"key":` from a possibly multi-field JSON line, e.g. + "ts": 123.0, "dur": 34.5, + """ + k = f"\"{key}\"" + pos = line.find(k) + if pos < 0: + return None + colon = line.find(":", pos + len(k)) + if colon < 0: + return None + frag = line[colon + 1 :].strip() + # Cut at next comma if present. + comma = frag.find(",") + if comma >= 0: + frag = frag[:comma] + try: + return float(frag.strip()) + except Exception: + return None + + +def _dims_to_mnk(input_dims: Any) -> Optional[Tuple[int, int, int]]: + """ + Convert args["Input Dims"] into a best-effort (M,N,K). + input_dims is typically a list where each element is [] (non-tensor) or + a list[int] (tensor dims). + """ + if not isinstance(input_dims, list): + return None + + tensor_dims: List[List[int]] = [] + for d in input_dims: + if isinstance(d, list) and len(d) >= 2 and all(isinstance(x, (int, float)) for x in d): + tensor_dims.append([int(x) for x in d]) + if len(tensor_dims) < 2: + return None + + a = tensor_dims[0] + b = tensor_dims[1] + a_m, a_k = a[-2], a[-1] + # b could be [k, n] or [n, k] depending on transpose convention. + if len(b) >= 2 and a_k == b[-2]: + b_k, b_n = b[-2], b[-1] + return (a_m, b_n, a_k) + if len(b) >= 2 and a_k == b[-1]: + # b is [n, k] + b_n, b_k = b[-2], b[-1] + return (a_m, b_n, a_k) + + # fallback: assume [k, n] + return (a_m, b[-1], a_k) + + +@dataclass +class ShapeStats: + calls: int = 0 + dur_us: float = 0.0 + + +def iter_op_events(trace_path: Path, target_ops: set[str]) -> Iterable[Tuple[str, Optional[float], Any]]: + """ + Yields (op_name, dur_us, input_dims) for events whose "name" is in target_ops. + Streaming + brace-depth parsing to avoid loading giant JSON into memory. + """ + in_trace_events = False + in_event = False + depth = 0 + + name: Optional[str] = None + dur: Optional[float] = None + input_dims: Any = None + want = False + + with trace_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + if not in_trace_events: + if '"traceEvents"' in line and "[" in line: + in_trace_events = True + continue + + # Start of a JSON object event in traceEvents list. + if not in_event: + stripped = line.lstrip() + if stripped.startswith("{"): + in_event = True + depth = stripped.count("{") - stripped.count("}") + name = None + dur = None + input_dims = None + want = False + else: + # End of traceEvents list. + if line.lstrip().startswith("]"): + break + continue + else: + depth += line.count("{") - line.count("}") + + # Parse fields we care about. + if '"name"' in line: + # Some traces put multiple fields on one line: + # "ph": "X", "cat": "cpu_op", "name": "aten::mm", ... + key = '"name":' + pos = line.find(key) + if pos >= 0: + q1 = line.find('"', pos + len(key)) + if q1 >= 0: + q2 = line.find('"', q1 + 1) + if q2 >= 0: + name = line[q1 + 1 : q2] + want = name in target_ops + + if want and dur is None and '"dur"' in line: + dur = _extract_number_after_key(line, "dur") + + if want and input_dims is None and "Input Dims" in line: + input_dims = _extract_json_array_after_key(line, "Input Dims") + + # End of current event object (also works for single-line events). 
+ if in_event and depth <= 0: + if want and name is not None: + yield (name, dur, input_dims) + in_event = False + + +def _human_int(n: float) -> str: + if n >= 1e9: + return f"{n/1e9:.3f}B" + if n >= 1e6: + return f"{n/1e6:.3f}M" + if n >= 1e3: + return f"{n/1e3:.3f}K" + return f"{int(n)}" + + +def main() -> None: + ap = argparse.ArgumentParser("Aggregate GEMM shapes from huge torch chrome trace") + ap.add_argument("--trace", type=str, required=True, help="Path to pytorch_trace_*.json") + ap.add_argument("--out", type=str, required=True, help="Output report path") + ap.add_argument("--ops", type=str, nargs="+", default=["aten::mm", "aten::addmm"], help="Op names to aggregate") + ap.add_argument("--topk", type=int, default=30) + args = ap.parse_args() + + trace_path = Path(args.trace) + out_path = Path(args.out) + target_ops = set(args.ops) + + # op -> (mnk -> stats) + agg: Dict[str, Dict[Tuple[int, int, int], ShapeStats]] = defaultdict(dict) + op_totals: Dict[str, ShapeStats] = defaultdict(ShapeStats) + op_unknown: Counter[str] = Counter() + + for op, dur_us, input_dims in iter_op_events(trace_path, target_ops): + op_totals[op].calls += 1 + if dur_us is not None: + op_totals[op].dur_us += dur_us + + mnk = _dims_to_mnk(input_dims) + if mnk is None: + op_unknown[op] += 1 + continue + + st = agg[op].get(mnk) + if st is None: + st = ShapeStats() + agg[op][mnk] = st + st.calls += 1 + if dur_us is not None: + st.dur_us += dur_us + + lines: List[str] = [] + lines.append(f"Trace: {trace_path}") + lines.append(f"Ops: {', '.join(sorted(target_ops))}") + lines.append("") + + for op in sorted(target_ops): + tot = op_totals.get(op, ShapeStats()) + lines.append(f"== {op} ==") + lines.append(f"total calls: {tot.calls}") + lines.append(f"total dur(us): {tot.dur_us:.3f}") + lines.append(f"unknown shapes: {op_unknown.get(op, 0)}") + lines.append("") + + if op not in agg or not agg[op]: + lines.append("(no shape stats)\n") + continue + + # Top by total dur + items = list(agg[op].items()) + items_by_dur = sorted(items, key=lambda kv: kv[1].dur_us, reverse=True)[: args.topk] + lines.append(f"-- top {args.topk} shapes by total dur(us) --") + lines.append("M,N,K calls total_dur(us) approx_GFLOP") + for (m, n, k), st in items_by_dur: + gflop = 2.0 * m * n * k / 1e9 + lines.append(f"{m},{n},{k} {st.calls} {st.dur_us:.3f} {gflop:.3f}") + lines.append("") + + # Top by calls + items_by_calls = sorted(items, key=lambda kv: kv[1].calls, reverse=True)[: args.topk] + lines.append(f"-- top {args.topk} shapes by calls --") + lines.append("M,N,K calls total_dur(us) avg_dur(us)") + for (m, n, k), st in items_by_calls: + avg = st.dur_us / st.calls if st.calls else 0.0 + lines.append(f"{m},{n},{k} {st.calls} {st.dur_us:.3f} {avg:.3f}") + lines.append("") + + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text("\n".join(lines), encoding="utf-8") + print(f"[OK] wrote: {out_path}") + + +if __name__ == "__main__": + main() + diff --git a/test/python/kernel/test_paged_attn_decode_triton.py b/test/python/kernel/test_paged_attn_decode_triton.py new file mode 100644 index 0000000..055dece --- /dev/null +++ b/test/python/kernel/test_paged_attn_decode_triton.py @@ -0,0 +1,240 @@ +import pytest +import torch +import torch.nn.functional as F + +from einops import rearrange +from types import SimpleNamespace + +from diffulex_kernel.python.paged_attn_decode_triton import paged_attn_decode_unified_triton + + +def _has_fp8() -> bool: + return hasattr(torch, "float8_e4m3fn") or hasattr(torch, "float8_e4m3fnuz") or 
hasattr(torch, "float8_e5m2") + + +def _build_cu_seqlens(lengths: torch.Tensor) -> torch.Tensor: + # lengths: [num_seqs] int32 on cuda + return torch.tensor( + [0] + list(torch.cumsum(lengths, dim=0).cpu().numpy()), + dtype=torch.int32, + device=lengths.device, + ) + + +def naive_sdpa_with_kvcache( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + scale: float, + num_groups: int, + page_block_size: int, +) -> torch.Tensor: + num_seqs = len(cu_seqlens_q) - 1 + output = torch.zeros_like(q) + for seq_idx in range(num_seqs): + q_start = int(cu_seqlens_q[seq_idx].item()) + q_end = int(cu_seqlens_q[seq_idx + 1].item()) + kv_start = int(cu_seqlens_k[seq_idx].item()) + kv_end = int(cu_seqlens_k[seq_idx + 1].item()) + + q_seq = q[q_start:q_end] # [q_len, Hq, D] + k_seq = k[kv_start:kv_end] # [new_len, Hkv, D] + v_seq = v[kv_start:kv_end] + + ctx = int(context_lens[seq_idx].item()) + k_cache_seq_list = [] + v_cache_seq_list = [] + for blk in range(block_tables.shape[1]): + page = int(block_tables[seq_idx, blk].item()) + if page < 0: + continue + blk_start = blk * page_block_size + if blk_start >= ctx: + continue + blk_end = min(blk_start + page_block_size, ctx) + n = blk_end - blk_start + k_cache_seq_list.append(k_cache[page, :n]) + v_cache_seq_list.append(v_cache[page, :n]) + + if k_cache_seq_list: + k_ctx = torch.cat(k_cache_seq_list, dim=0) + v_ctx = torch.cat(v_cache_seq_list, dim=0) + k_comb = torch.cat([k_ctx, k_seq], dim=0) + v_comb = torch.cat([v_ctx, v_seq], dim=0) + else: + k_comb = k_seq + v_comb = v_seq + + q_sdpa = rearrange(q_seq, "s h d -> 1 h s d") + k_sdpa = rearrange(k_comb, "s h d -> 1 h s d") + v_sdpa = rearrange(v_comb, "s h d -> 1 h s d") + attn_out = F.scaled_dot_product_attention( + q_sdpa, + k_sdpa, + v_sdpa, + dropout_p=0.0, + is_causal=False, + scale=scale, + enable_gqa=True, + ) + output[q_start:q_end] = rearrange(attn_out, "1 h s d -> s h d").to(output.dtype) + + return output + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for Triton paged-attention kernel") +def test_paged_decode_triton_bf16_cache_matches_reference(): + torch.manual_seed(0) + device = torch.device("cuda") + + num_seqs = 4 + num_heads = 32 + num_kv_heads = 8 + head_dim = 128 + page_size = 32 + diffusion_block_size = 32 + + num_groups = num_heads // num_kv_heads + + # Per-seq query/new KV length (decode step) + q_lens = torch.full((num_seqs,), diffusion_block_size, dtype=torch.int32, device=device) + cu_q = _build_cu_seqlens(q_lens) + cu_k = cu_q.clone() + total_q = int(cu_q[-1].item()) + + # Context lengths (vary per seq) + context_lens = torch.tensor([0, 17, 63, 128], dtype=torch.int32, device=device) + max_ctx = int(context_lens.max().item()) + max_seq_blocks = (max_ctx + page_size - 1) // page_size + num_page_blocks = num_seqs * max_seq_blocks + + # Assign each seq its own contiguous pages + block_tables = torch.full((num_seqs, max_seq_blocks), -1, dtype=torch.int32, device=device) + for s in range(num_seqs): + for b in range(max_seq_blocks): + block_tables[s, b] = s * max_seq_blocks + b + + q = torch.randn((total_q, num_heads, head_dim), device=device, dtype=torch.bfloat16) + k = torch.randn((total_q, num_kv_heads, head_dim), device=device, dtype=torch.bfloat16) + v = torch.randn_like(k) + + k_cache = torch.randn((num_page_blocks, page_size, num_kv_heads, head_dim), device=device, 
dtype=torch.bfloat16) + v_cache = torch.randn_like(k_cache) + + md = SimpleNamespace( + kv_cache_layout="unified", + block_tables=block_tables, + context_lens=context_lens, + cu_seqlens_q=cu_q, + max_seqlen_q=int(q_lens.max().item()), + page_block_size=page_size, + ) + scale = 1.0 / (head_dim**0.5) + + out = paged_attn_decode_unified_triton(q, k, v, k_cache, v_cache, md, softmax_scale=scale, fp8_cache=False) + ref = naive_sdpa_with_kvcache( + q, + k, + v, + k_cache, + v_cache, + block_tables, + context_lens, + cu_q, + cu_k, + scale, + num_groups, + page_size, + ) + + torch.testing.assert_close(out, ref, atol=1e-2, rtol=1e-2) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for Triton paged-attention kernel") +@pytest.mark.skipif(not _has_fp8(), reason="This torch build does not expose FP8 dtypes") +def test_paged_decode_triton_fp8_cache_matches_reference(): + torch.manual_seed(0) + device = torch.device("cuda") + + fp8_dtype = torch.float8_e4m3fn if hasattr(torch, "float8_e4m3fn") else torch.float8_e5m2 + + num_seqs = 2 + num_heads = 16 + num_kv_heads = 4 + head_dim = 128 + page_size = 32 + diffusion_block_size = 32 + num_groups = num_heads // num_kv_heads + + q_lens = torch.full((num_seqs,), diffusion_block_size, dtype=torch.int32, device=device) + cu_q = _build_cu_seqlens(q_lens) + cu_k = cu_q.clone() + total_q = int(cu_q[-1].item()) + + context_lens = torch.tensor([37, 55], dtype=torch.int32, device=device) + max_ctx = int(context_lens.max().item()) + max_seq_blocks = (max_ctx + page_size - 1) // page_size + num_page_blocks = num_seqs * max_seq_blocks + block_tables = torch.full((num_seqs, max_seq_blocks), -1, dtype=torch.int32, device=device) + for s in range(num_seqs): + for b in range(max_seq_blocks): + block_tables[s, b] = s * max_seq_blocks + b + + q = torch.randn((total_q, num_heads, head_dim), device=device, dtype=torch.bfloat16) + k = torch.randn((total_q, num_kv_heads, head_dim), device=device, dtype=torch.bfloat16) + v = torch.randn_like(k) + + # Build BF16 "true" cache values, then quantize to FP8 as (x / scale) -> fp8, with per-head scales. + k_cache_true = torch.randn((num_page_blocks, page_size, num_kv_heads, head_dim), device=device, dtype=torch.bfloat16) * 0.5 + v_cache_true = torch.randn_like(k_cache_true) * 0.5 + + eps = 1e-6 + k_absmax = k_cache_true.to(torch.float32).abs().amax(dim=(0, 1, 3)) + v_absmax = v_cache_true.to(torch.float32).abs().amax(dim=(0, 1, 3)) + fp8_max = 448.0 if fp8_dtype == torch.float8_e4m3fn else 57344.0 + k_scale = (k_absmax / fp8_max).clamp_min(eps).to(torch.float32) + v_scale = (v_absmax / fp8_max).clamp_min(eps).to(torch.float32) + + k_cache_fp8 = (k_cache_true.to(torch.float32) / k_scale.view(1, 1, -1, 1)).to(fp8_dtype) + v_cache_fp8 = (v_cache_true.to(torch.float32) / v_scale.view(1, 1, -1, 1)).to(fp8_dtype) + + md = SimpleNamespace( + kv_cache_layout="unified", + block_tables=block_tables, + context_lens=context_lens, + cu_seqlens_q=cu_q, + max_seqlen_q=int(q_lens.max().item()), + page_block_size=page_size, + k_scale=k_scale, + v_scale=v_scale, + ) + scale = 1.0 / (head_dim**0.5) + + out = paged_attn_decode_unified_triton(q, k, v, k_cache_fp8, v_cache_fp8, md, softmax_scale=scale, fp8_cache=True) + + # Reference uses dequantized cache. 
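+    # The Triton kernel folds k_scale/v_scale into its attention math, so the reference
+    # first dequantizes the fp8 cache back to bf16 with the same per-head scales.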
+ k_cache_deq = (k_cache_fp8.float() * k_scale.view(1, 1, -1, 1)).to(torch.bfloat16) + v_cache_deq = (v_cache_fp8.float() * v_scale.view(1, 1, -1, 1)).to(torch.bfloat16) + ref = naive_sdpa_with_kvcache( + q, + k, + v, + k_cache_deq, + v_cache_deq, + block_tables, + context_lens, + cu_q, + cu_k, + scale, + num_groups, + page_size, + ) + + torch.testing.assert_close(out, ref, atol=2e-2, rtol=2e-2) + From f6d0fa296e48eae387b63f16b1528061c5e7a877 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Sun, 25 Jan 2026 13:36:40 +0000 Subject: [PATCH 07/10] refactor: remove CUDA Graph blockers and simplify linear quantization strategies - Remove all .item() calls in LinearBase hot paths (GPU->CPU sync breaks graph capture) - Add Python-side meta cache (_offline_quant_*_py, _gptq_is_shuffled_py, etc.) - Use in-place fill_() + Python mirrors for state updates - Simplify linear quantization strategies for future CUDA Graph support - Remove fast_path checks and redundant branching in linear_marlin_int8_w8a16 - Remove fast_path in linear_int8_w8a8 (unified vLLM path) - Simplify linear_gptq_w4a16 (direct torch.ops._C.gptq_gemm call) - Make linear_fp8_w8a16 use explicit quant_scales parameter - Fix FP8 weight layout: do not force contiguous for transpose-view (KxN stride0==1) - Remove profiler record_function wrappers (graph-friendly) Net: -129 lines, cleaner codebase ready for CUDA Graph capture --- diffulex/layer/linear.py | 91 +++++--- .../strategies/linear_fp8_w8a16.py | 8 +- .../strategies/linear_gptq_w4a16.py | 57 ++--- .../strategies/linear_int8_w8a8.py | 53 +---- .../strategies/linear_marlin_int8_w8a16.py | 217 +++++------------- profile/torch_d2f_profiler.py | 5 + 6 files changed, 151 insertions(+), 280 deletions(-) diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index e3581e9..5cc4b6d 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -134,19 +134,31 @@ def __init__( self.register_buffer("awq_marlin_zp", torch.empty(0, dtype=torch.int32), persistent=False) self.register_buffer("awq_marlin_workspace", torch.empty(0, dtype=torch.int32), persistent=False) + # ---- Python-side meta cache (CUDA Graph friendly) ---- + # Avoid `.item()` on CUDA tensors in hot paths (it introduces GPU->CPU sync and breaks graph capture). 
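+        # Convention: each `*_py` attribute below mirrors a buffer of the same name (minus
+        # the suffix); the buffers are still updated in place via fill_(), while hot-path
+        # branching reads only these Python values.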
+ self._weight_is_quantized_py: bool = False + # 0=none, 1=gptq, 2=awq + self._offline_quant_format_py: int = 0 + self._offline_quant_bits_py: int = 0 + self._offline_quant_group_size_py: int = 128 + self._offline_quant_out_features_py: int = 0 + self._offline_quant_in_features_py: int = 0 + self._gptq_is_shuffled_py: bool = False + self._gptq_marlin_is_prepared_py: bool = False + self._awq_marlin_is_prepared_py: bool = False + def has_quantized_weight(self) -> bool: - return bool(self._weight_is_quantized.item()) and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 + return self._weight_is_quantized_py and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 def has_offline_quantized_weight(self) -> bool: """Check if offline quantized weights (GPTQ/AWQ) are present.""" - format_val = int(self._offline_quant_format.item()) if self._offline_quant_format.numel() > 0 else 0 - if format_val == 1: # GPTQ + if self._offline_quant_format_py == 1: # GPTQ return ( self.gptq_qweight.numel() > 0 and self.gptq_qzeros.numel() > 0 and self.gptq_scales.numel() > 0 ) - elif format_val == 2: # AWQ + elif self._offline_quant_format_py == 2: # AWQ return ( self.awq_qweight.numel() > 0 and self.awq_qzeros.numel() > 0 @@ -224,6 +236,8 @@ def _infer_module_device() -> torch.device: bits = 32 // pack_factor if format == "awq" and bits != 4: raise ValueError(f"AWQ 目前仅支持 4-bit(pack_factor=8),当前推断 bits={bits} (pack_factor={pack_factor})") + # Cache meta as Python primitives (graph-friendly). + self._offline_quant_bits_py = int(bits) # Record bits for downstream kernels (esp. marlin path). self._offline_quant_bits = torch.tensor(bits, dtype=torch.int32, device=module_device) @@ -305,6 +319,8 @@ def _infer_module_device() -> torch.device: self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) self._offline_quant_format = torch.tensor(1, dtype=torch.int8, device=module_device) self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + self._offline_quant_format_py = 1 + self._gptq_is_shuffled_py = False else: # AWQ self.awq_qweight = qweight self.awq_qzeros = qzeros @@ -316,6 +332,8 @@ def _infer_module_device() -> torch.device: self.gptq_g_idx = torch.empty(0, dtype=torch.int32, device=module_device) self._offline_quant_format = torch.tensor(2, dtype=torch.int8, device=module_device) self._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + self._offline_quant_format_py = 2 + self._gptq_is_shuffled_py = False # Reset marlin-prep caches (weights may have changed / moved). self._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) @@ -334,6 +352,12 @@ def _infer_module_device() -> torch.device: self._offline_quant_group_size = torch.tensor(group_size, dtype=torch.int32, device=module_device) self._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) self._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) + # Python meta mirrors. 
+ self._offline_quant_group_size_py = int(group_size) + self._offline_quant_out_features_py = int(out_features) + self._offline_quant_in_features_py = int(in_features) + self._gptq_marlin_is_prepared_py = False + self._awq_marlin_is_prepared_py = False # Drop bf16 weight Parameter if present (to free memory) if "weight" in self._parameters: @@ -342,13 +366,11 @@ def _infer_module_device() -> torch.device: def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: """Prepare vLLM GPTQ weights on first use (required gptq_shuffle).""" - if self._offline_quant_format.numel() == 0: - return - if int(self._offline_quant_format.item()) != 1: + if self._offline_quant_format_py != 1: return if self.gptq_qweight.numel() == 0: return - if self._gptq_is_shuffled.numel() > 0 and bool(self._gptq_is_shuffled.item()): + if self._gptq_is_shuffled_py: return # Lazy import to avoid pulling vLLM unless GPTQ offline weights are used. @@ -373,7 +395,7 @@ def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: # Infer weight_bits from packed qweight shape to support GPTQ W2/W4/W8. # qweight: [K/pack_factor, N], where pack_factor = 32 / weight_bits. - in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else None + in_features = int(self._offline_quant_in_features_py) if in_features is None or in_features <= 0: raise RuntimeError("GPTQ offline 权重已加载,但无法推断 in_features 以计算 weight_bits。") if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: @@ -389,20 +411,20 @@ def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: ) weight_bits = 32 // pack_factor ops.gptq_shuffle(self.gptq_qweight, g_idx, weight_bits) - self._gptq_is_shuffled = torch.tensor(True, dtype=torch.bool, device=x.device) + # Do NOT create new tensors on hot paths; update in-place + python mirror. + self._gptq_is_shuffled.fill_(True) + self._gptq_is_shuffled_py = True def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: """Prepare vLLM GPTQ Marlin weights on first use (repack + permute scales/zp). IMPORTANT: This path must NOT call `gptq_shuffle` (that is specific to gptq_gemm/exllama). """ - if self._offline_quant_format.numel() == 0: - return - if int(self._offline_quant_format.item()) != 1: + if self._offline_quant_format_py != 1: return if self.gptq_qweight.numel() == 0: return - if self._gptq_marlin_is_prepared.numel() > 0 and bool(self._gptq_marlin_is_prepared.item()): + if self._gptq_marlin_is_prepared_py: return try: @@ -425,9 +447,9 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: "请确保模型与输入在同一设备。" ) - in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 - out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 - group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + in_features = int(self._offline_quant_in_features_py) + out_features = int(self._offline_quant_out_features_py) + group_size = int(self._offline_quant_group_size_py) if in_features <= 0 or out_features <= 0: raise RuntimeError( f"GPTQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" @@ -436,7 +458,7 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: # Determine weight_bits. # - Standard GPTQ layout: infer from qweight K packing. 
# - Marlin-exported layout: bits cannot be inferred from qweight shape; use recorded bits. - weight_bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + weight_bits = int(self._offline_quant_bits_py) if weight_bits <= 0: if self.gptq_qweight.shape[0] <= 0 or in_features % int(self.gptq_qweight.shape[0]) != 0: raise RuntimeError( @@ -503,17 +525,16 @@ def _maybe_prepare_offline_gptq_marlin(self, x: torch.Tensor) -> None: # Use empty zp to keep has_zp=False in the kernel. self.gptq_marlin_zp = marlin_make_empty_g_idx(device) - self._gptq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + self._gptq_marlin_is_prepared.fill_(True) + self._gptq_marlin_is_prepared_py = True def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: """Prepare vLLM AWQ Marlin weights on first use (repack + permute scales/zp).""" - if self._offline_quant_format.numel() == 0: - return - if int(self._offline_quant_format.item()) != 2: + if self._offline_quant_format_py != 2: return if self.awq_qweight.numel() == 0: return - if self._awq_marlin_is_prepared.numel() > 0 and bool(self._awq_marlin_is_prepared.item()): + if self._awq_marlin_is_prepared_py: return try: @@ -535,9 +556,9 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: "请确保模型与输入在同一设备。" ) - in_features = int(self._offline_quant_in_features.item()) if self._offline_quant_in_features.numel() > 0 else 0 - out_features = int(self._offline_quant_out_features.item()) if self._offline_quant_out_features.numel() > 0 else 0 - group_size = int(self._offline_quant_group_size.item()) if self._offline_quant_group_size.numel() > 0 else 128 + in_features = int(self._offline_quant_in_features_py) + out_features = int(self._offline_quant_out_features_py) + group_size = int(self._offline_quant_group_size_py) if in_features <= 0 or out_features <= 0: raise RuntimeError( f"AWQ Marlin: invalid feature sizes: in_features={in_features}, out_features={out_features}" @@ -579,7 +600,8 @@ def _maybe_prepare_offline_awq_marlin(self, x: torch.Tensor) -> None: is_a_8bit=False, ).contiguous() - self._awq_marlin_is_prepared = torch.tensor(True, dtype=torch.bool, device=device) + self._awq_marlin_is_prepared.fill_(True) + self._awq_marlin_is_prepared_py = True def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: torch.Tensor) -> None: # Support: @@ -617,6 +639,8 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to # FP8 W8A16 uses float32 scales if weight_format in ("fp8_e4m3", "fp8_e5m2") and act_format == "bf16": scale_dtype = torch.float32 + # Keep KxN transpose-view layout (do NOT force contiguous) for vLLM FP8 kernels. + force_weight_contig = False # W8A8 int8 uses float32 [1, N] weight scales in vLLM cutlass_scaled_mm path. elif weight_format == "int8" and act_format == "int8": scale_dtype = torch.float32 @@ -644,6 +668,7 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to # 1xN view for fused kernels expecting 2D scales. 
self.quant_scales_1xn = quant_scales if quant_scales.dim() == 2 else quant_scales.view(1, -1) self._weight_is_quantized.fill_(True) + self._weight_is_quantized_py = True def _maybe_promote_weight_to_quantized_at_runtime( self, @@ -744,9 +769,9 @@ def _get_linear_strategy(self): def _offline_meta(self) -> tuple[int, int, int]: """Return (out_features, in_features, group_size) for offline GPTQ/AWQ.""" return ( - int(self._offline_quant_out_features.item()), - int(self._offline_quant_in_features.item()), - int(self._offline_quant_group_size.item()), + int(self._offline_quant_out_features_py), + int(self._offline_quant_in_features_py), + int(self._offline_quant_group_size_py), ) def _infer_gptq_weight_bits(self, *, in_features: int) -> int: @@ -756,7 +781,7 @@ def _infer_gptq_weight_bits(self, *, in_features: int) -> int: - use recorded bits (e.g., marlin-exported layouts), - otherwise infer from qweight packing. """ - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 0 + bits = int(self._offline_quant_bits_py) if bits > 0: return bits if self.gptq_qweight.numel() == 0: @@ -783,7 +808,7 @@ def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: if strategy is None: raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") - format_val = int(self._offline_quant_format.item()) + format_val = int(self._offline_quant_format_py) weight_format = getattr(strategy, "linear_weight_format", None) out_features, in_features, group_size = self._offline_meta() @@ -887,7 +912,7 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. if weight_format == "awq": # AWQ is 4-bit only in vLLM; bits stored in _offline_quant_bits. - bits = int(self._offline_quant_bits.item()) if self._offline_quant_bits.numel() > 0 else 4 + bits = int(self._offline_quant_bits_py) if int(self._offline_quant_bits_py) > 0 else 4 pack_factor = 32 // max(1, bits) return strategy.linear_forward( x, diff --git a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py index 85048d8..b25cf99 100644 --- a/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_fp8_w8a16.py @@ -108,12 +108,10 @@ def linear_forward( bias: Optional[torch.Tensor], *, quant_kind: str, - **kwargs: Any, + quant_scales: Optional[torch.Tensor] = None, + out_features: Optional[int] = None, ) -> torch.Tensor: - _ = quant_kind - from vllm.platforms import current_platform # type: ignore - - quant_scales = kwargs.get("quant_scales", None) + _ = quant_kind, out_features if weight is not None and quant_scales is not None: # Expected: weight is fp8 K×N tensor (transpose-view is fine). q_kn = weight.to(device=x.device) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index f0a7a98..95e5b9e 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -112,65 +112,34 @@ def linear_forward( raise RuntimeError("GPTQ offline weights missing packed tensors and bf16 weight is not present.") return F.linear(x, weight, bias) + if weight_bits <= 0: + raise RuntimeError("GPTQ requires explicit weight_bits (>0) for the CUDA kernel path.") + # vLLM GPTQ kernels expect FP16 activations. 
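+        # The result is cast back to the caller's activation dtype before returning (see the final return below).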
x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + x2 = x_in.reshape(-1, x_in.shape[-1]) if x_in.dim() != 2 else x_in + if not x2.is_contiguous(): + x2 = x2.contiguous() - # ---- Fast path ---- - if ( - x_in.dim() == 2 - and x_in.is_contiguous() - and qweight.device == x.device - and qzeros.device == x.device - and scales.device == x.device - and qweight.dtype == torch.int32 - and qzeros.dtype == torch.int32 - and scales.dtype == torch.float16 - and qweight.is_contiguous() - and qzeros.is_contiguous() - and scales.is_contiguous() - and weight_bits > 0 - ): - if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): - g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) - else: - # Prefer already-correct dtype/device to avoid per-call copies. - g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) - n = int(out_features) if out_features is not None else int(qweight.shape[-1]) - output = torch.ops._C.gptq_gemm( - x_in, - qweight, - qzeros, - scales, - g_idx_t, - True, - bool(use_v2_format), - int(weight_bits), - ) - if bias is not None: - output.add_(bias.to(dtype=output.dtype)) - # Output is [M,N] - return output.to(dtype=x.dtype) if output.dtype != x.dtype else output - - out_shape = x.shape[:-1] + (int(out_features) if out_features is not None else int(qweight.shape[-1]),) - reshaped_x = x_in.reshape(-1, x_in.shape[-1]) - if g_idx is None or (isinstance(g_idx, torch.Tensor) and g_idx.numel() == 0): + if g_idx is None or g_idx.numel() == 0: g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) else: - g_idx_t = g_idx.to(device=x.device, dtype=torch.int) + g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) - output = ops.gptq_gemm( - reshaped_x, + output = torch.ops._C.gptq_gemm( + x2, qweight, qzeros, scales, g_idx_t, True, # use_exllama bool(use_v2_format), - int(weight_bits) if weight_bits > 0 else 4, + int(weight_bits), ) if bias is not None: output.add_(bias.to(dtype=output.dtype)) + + out_shape = x.shape[:-1] + (int(out_features) if out_features is not None else int(qweight.shape[-1]),) output = output.reshape(out_shape) - # Keep output dtype consistent with input activations for downstream layers. return output.to(dtype=x.dtype) if output.dtype != x.dtype else output diff --git a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py index ae62b64..ba07440 100644 --- a/diffulex/utils/quantization/strategies/linear_int8_w8a8.py +++ b/diffulex/utils/quantization/strategies/linear_int8_w8a8.py @@ -38,12 +38,6 @@ def __init__(self) -> None: super().__init__() # Cache: id(weight) -> (qweight_int8 [N,K], w_scales_fp32 [N]) self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} - self._ops_available: bool = bool( - _vllm_ops is not None - and hasattr(torch.ops, "_C") - and hasattr(torch.ops._C, "dynamic_scaled_int8_quant") - and hasattr(torch.ops._C, "cutlass_scaled_mm") - ) @property def name(self) -> str: @@ -115,44 +109,13 @@ def linear_forward( out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind + if _vllm_ops is None: + raise RuntimeError("vLLM custom ops are required for W8A8 (scaled_int8_quant / cutlass_scaled_mm).") - # ---- Fast path (decode hot path) ---- - # Preconditions are strict to minimize Python overhead. 
- # Expect: - # - qweight: int8 KxN with stride(0)==1 - # - w_scales: float32 [1,N], contiguous - if ( - self._ops_available - and _vllm_ops is not None - and x.dim() == 2 - and x.device.type == "cuda" - and x.dtype in (torch.bfloat16, torch.float16) - and x.is_contiguous() - and weight is not None - and weight.dtype == torch.int8 - and weight.device == x.device - and weight.stride(0) == 1 - and quant_scales is not None - and quant_scales.device == x.device - and quant_scales.dtype == torch.float32 - and quant_scales.dim() == 2 - and quant_scales.is_contiguous() - ): - m, _k = x.shape - # Optionally validate N to catch wrong metadata early. - if out_features is None or int(out_features) == int(quant_scales.shape[1]): - x_q = torch.empty((m, _k), device=x.device, dtype=torch.int8) - x_s = torch.empty((m, 1), device=x.device, dtype=torch.float32) - torch.ops._C.dynamic_scaled_int8_quant(x_q, x, x_s, None) - out = torch.empty((m, int(quant_scales.shape[1])), device=x.device, dtype=x.dtype) - torch.ops._C.cutlass_scaled_mm(out, x_q, weight, x_s, quant_scales, bias) - return out - - # If weight already quantized by LinearBase.load-time quantization. + # Weight/scales: prefer load-time quantized buffers. if weight is not None and weight.dtype == torch.int8 and quant_scales is not None: - # Expected: qweight is K×N int8 (may be non-contiguous), quant_scales is [1,N] fp32 qweight = weight - w_scales = quant_scales.to(dtype=torch.float32) + w_scales = quant_scales else: wid = id(weight) cached = self._weight_cache.get(wid) @@ -164,13 +127,15 @@ def linear_forward( else: qweight, w_scales = cached - # Flatten like torch.nn.functional.linear orig_shape = x.shape x2 = x.reshape(-1, x.shape[-1]) if x.dim() != 2 else x if x2.dtype not in (torch.bfloat16, torch.float16): x2 = x2.to(torch.bfloat16) - # dynamic per-token int8 quant + fused GEMM_DQ - x_q, x_s, _ = _vllm_ops.scaled_int8_quant(x2.contiguous(), scale=None, azp=None, symmetric=True) + if not x2.is_contiguous(): + x2 = x2.contiguous() + + # dynamic per-token int8 quant + fused GEMM+dequant + x_q, x_s, _ = _vllm_ops.scaled_int8_quant(x2, scale=None, azp=None, symmetric=True) y = _vllm_ops.cutlass_scaled_mm( x_q, qweight, diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index fe99904..c2ff1ce 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -35,15 +35,6 @@ def _allspark_is_available() -> bool: and hasattr(_vllm_ops, "allspark_repack_weight") ) - -def _allspark_w8a16_gemm(*args, **kwargs): - if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_w8a16_gemm"): - raise RuntimeError("vLLM custom ops are unavailable: missing `allspark_w8a16_gemm`.") - # Narrow profiler range to isolate Python wrapper overhead vs kernel time. 
- with torch.profiler.record_function("w8a16/allspark_w8a16_gemm(pybind)"): - return _vllm_ops.allspark_w8a16_gemm(*args, **kwargs) - - def _allspark_repack_weight(b_qweight_kn: torch.Tensor, scales_1xn: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Repack KxN uint8 qweight + 1xN scales into (N_32,K) + (1,N_32) for AllSpark GEMM.""" if _vllm_ops is None or not hasattr(_vllm_ops, "allspark_repack_weight"): @@ -262,158 +253,76 @@ def linear_forward( out_features: Optional[int] = None, ) -> torch.Tensor: _ = quant_kind - - # ---- Fast path (decode hot path) ---- - # Goal: make Python-side overhead close to a single custom-op call (+ optional bias add). - # Preconditions are intentionally strict; otherwise we fall back to the fully-checked path. - # - # Notes: - # - We call `_vllm_ops.allspark_w8a16_gemm` directly to avoid extra Python wrapper overhead. - # - We require `quant_scales` already in 1xN contiguous layout (LinearBase provides this). - if ( - self._allspark_available - and _vllm_ops is not None - and x.dim() == 2 - and x.device.type == "cuda" - and x.dtype == torch.bfloat16 - and x.is_contiguous() - and weight is not None - and weight.dtype in (torch.uint8, torch.int8) - and weight.is_contiguous() - and quant_scales is not None - and quant_scales.dim() == 2 - and quant_scales.is_contiguous() - and out_features is not None - ): - # Minimal shape checks (avoid slow/branchy fallback). - m, k = x.shape - n_32, k_w = weight.shape - if k_w == k and (k & 15) == 0 and 0 < int(out_features) <= int(n_32): - sm_count, sm_version = self._get_sm_info(x.device) - y = _vllm_ops.allspark_w8a16_gemm( - x, - weight, - quant_scales, - None, # b_qzeros - int(out_features), - -1, # group_size (only supports -1) - sm_count, - sm_version, - self._cublas_m_thr, - False, # has_zp - True, # n32k16_reorder - ) - if bias is not None: - y = y + bias - return y - - # Handle >2D like torch.nn.functional.linear: flatten then reshape back. - with torch.profiler.record_function("w8a16/reshape_input"): - orig_shape = x.shape - if x.dim() == 1: - x2 = x.unsqueeze(0) - elif x.dim() == 2: - x2 = x - else: - x2 = x.reshape(-1, x.shape[-1]) - - # Load-time quantized module path: weight is uint8/int8 buffer and scales provided. - with torch.profiler.record_function("w8a16/select_qweight_scales"): - if weight is not None and weight.dtype in (torch.uint8, torch.int8): - if quant_scales is None: - raise ValueError("quant_scales is required when weight is quantized") - qweight = weight - scales = quant_scales - else: - # Lazy cache for bf16 weights (not expected in steady-state, but keep for safety). - weight_id = id(weight) - cached = self._weight_cache.get(weight_id) - if cached is None or cached[0].device != x2.device: - qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) - self._weight_cache[weight_id] = (qweight, scales) - else: - qweight, scales = cached - - # If fused kernel isn't available, fall back to BF16 only if original weight exists; - # otherwise fail fast (do NOT dequantize a full matrix, which is memory-prohibitive). - if not self._allspark_available: + if not self._allspark_available or _vllm_ops is None: + # correctness fallback only when bf16 weight exists if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): return F.linear(x, weight, bias) raise RuntimeError( - "vLLM AllSpark W8A16 fused kernel is unavailable, and bf16 weight is not present. " + "vLLM AllSpark W8A16 fused kernel is unavailable. 
" "Please ensure vLLM custom ops are installed and loadable (`import vllm._custom_ops`)." ) - # AllSpark kernel requires CUDA and contiguous inputs. - with torch.profiler.record_function("w8a16/device_dtype_checks"): - if x2.device.type != "cuda": - return self._fallback(x, weight, qweight, scales, bias) - - if x2.dtype != torch.bfloat16: - x2 = x2.to(dtype=torch.bfloat16) - - # Shape checks: x2 [M,K], qweight [N_32align,K] - with torch.profiler.record_function("w8a16/shape_checks"): - m, k = x2.shape - n_32, k_w = qweight.shape - if k_w != k: - return self._fallback(x, weight, qweight, scales, bias) - if k % 16 != 0: - return self._fallback(x, weight, qweight, scales, bias) - - # Recover real N from module bias/metadata if available; default to n_32. - # In Diffulex, LinearBase stores output_size; but strategy doesn't receive module. - # So we infer N from bias if present else from scales length (can be N_32align). - with torch.profiler.record_function("w8a16/infer_n_and_sm"): - if out_features is not None: - n = int(out_features) - else: - # Backward compatible fallback. - n = int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32)) - if n <= 0 or n > n_32: - n = n_32 - - sm_count, sm_version = self._get_sm_info(x2.device) - cublas_thr = self._cublas_m_thr - - # vLLM allspark expects scales as 1xN (or equivalent contiguous view). - # NOTE: reshape/view doesn't allocate; only materialize contiguous copies when needed. - with torch.profiler.record_function("w8a16/prepare_contiguous_and_scales"): - if not x2.is_contiguous(): - x2 = x2.contiguous() - # qweight/scales are made contiguous at load-time (`LinearBase.set_quantized_weight`) - # and by `quantize_weight_for_kernel` return values. - if scales.dim() == 2: - scales_1xn = scales - else: - scales_1xn = scales.view(1, -1) - - with torch.profiler.record_function("w8a16/call_fused_gemm"): - y2 = _allspark_w8a16_gemm( - x2, - qweight, - scales_1xn, - None, # b_qzeros - n, - -1, # group_size (only supports -1) - sm_count, - sm_version, - cublas_thr, - False, # has_zp - True, # n32k16_reorder - ) - if bias is not None: - y2 = y2 + bias - - # Reshape back - with torch.profiler.record_function("w8a16/reshape_output"): - if x.dim() == 1: - y = y2.squeeze(0) - elif x.dim() == 2: - y = y2 + orig_shape = x.shape + x2 = x.reshape(-1, x.shape[-1]) if x.dim() != 2 else x + if x2.device.type != "cuda": + if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): + return F.linear(x, weight, bias) + raise RuntimeError("AllSpark W8A16 requires CUDA inputs.") + + if x2.dtype != torch.bfloat16: + x2 = x2.to(dtype=torch.bfloat16) + if not x2.is_contiguous(): + x2 = x2.contiguous() + + # Load-time quantized module path: weight is uint8/int8 buffer and scales provided. + if weight is not None and weight.dtype in (torch.uint8, torch.int8): + if quant_scales is None: + raise ValueError("quant_scales is required when weight is quantized") + qweight = weight + scales = quant_scales + else: + # Safety net for bf16 weights (should be rare in steady-state). 
+ weight_id = id(weight) + cached = self._weight_cache.get(weight_id) + if cached is None or cached[0].device != x2.device: + qweight, scales = self.quantize_weight_for_kernel(weight, device=x2.device) + self._weight_cache[weight_id] = (qweight, scales) else: - y = y2.reshape(*orig_shape[:-1], y2.shape[-1]) - return y + qweight, scales = cached + + m, k = x2.shape + n_32, k_w = qweight.shape + if k_w != k or (k & 15) != 0: + if weight is not None and getattr(weight, "dtype", None) in (torch.float16, torch.bfloat16): + y = F.linear(x, weight, bias) + return y + raise RuntimeError(f"AllSpark W8A16 requires K%16==0 and matching K. Got x.K={k}, w.K={k_w}.") + + n = int(out_features) if out_features is not None else (int(bias.numel()) if bias is not None else int(min(scales.numel(), n_32))) + n = n_32 if (n <= 0 or n > n_32) else n + scales_1xn = scales if scales.dim() == 2 else scales.view(1, -1) + + sm_count, sm_version = self._get_sm_info(x2.device) + y2 = _vllm_ops.allspark_w8a16_gemm( + x2, + qweight, + scales_1xn, + None, # b_qzeros + n, + -1, # group_size (only supports -1) + sm_count, + sm_version, + self._cublas_m_thr, + False, # has_zp + True, # n32k16_reorder + ) + if bias is not None: + y2 = y2 + bias + if orig_shape == x2.shape: + return y2 + if x.dim() == 1: + return y2.squeeze(0) + return y2.reshape(*orig_shape[:-1], y2.shape[-1]) # NOTE: We intentionally do not provide a generic dequantize+F.linear fallback for reordered weights. # It materializes a full bf16 matrix and is prone to OOM on large models. diff --git a/profile/torch_d2f_profiler.py b/profile/torch_d2f_profiler.py index 7688154..780ef2f 100644 --- a/profile/torch_d2f_profiler.py +++ b/profile/torch_d2f_profiler.py @@ -109,6 +109,9 @@ def main() -> None: # Engine settings (force single-process profiling by default) parser.add_argument("--tensor-parallel-size", type=int, default=1, help="建议保持 1,否则会 spawn 子进程导致采集不到 CUDA") parser.add_argument("--data-parallel-size", type=int, default=1) + # Distributed comm (avoid port conflicts with other local runs) + parser.add_argument("--master-addr", type=str, default="localhost") + parser.add_argument("--master-port", type=int, default=2333) parser.add_argument("--gpu-memory-utilization", type=float, default=0.30) parser.add_argument("--max-model-len", type=int, default=1024) @@ -171,6 +174,8 @@ def main() -> None: enforce_eager=True, tensor_parallel_size=args.tensor_parallel_size, data_parallel_size=args.data_parallel_size, + master_addr=args.master_addr, + master_port=args.master_port, gpu_memory_utilization=args.gpu_memory_utilization, max_model_len=args.max_model_len, max_num_batched_tokens=max(1024, args.max_model_len), From 7fba595c2189b196bb9a44ae20a7fe88f90cde72 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Mon, 26 Jan 2026 03:37:36 +0000 Subject: [PATCH 08/10] perf: cache linear forward dispatch for CUDA Graph - Add per-layer ForwardPlan to pre-resolve bf16/quant/offline paths and reduce per-call Python branching. - Prefer direct torch.ops kernels (GPTQ/AWQ/Marlin) with static args for stable capture. - Fix D2F static CUDA graph capture/replay metadata (token buckets + cu_seqlens) and add profiler flag. 
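
For context, a minimal sketch of the cached-dispatch idea follows. It is illustrative only: `PlanSig` and `CachedLinear` are simplified stand-ins for the real `_ForwardPlanSig` / `LinearBase` plan classes, and the real plans resolve quantized kernel paths (GPTQ/AWQ/Marlin) and freeze their static arguments rather than a plain bf16 GEMM.

    from dataclasses import dataclass
    from typing import Callable, Optional, Tuple
    import torch

    @dataclass(frozen=True)
    class PlanSig:
        device: str
        dtype: torch.dtype
        shape: Tuple[int, ...]

    class CachedLinear(torch.nn.Module):
        def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> None:
            super().__init__()
            self.weight, self.bias = weight, bias
            self._plan: Optional[Tuple[PlanSig, Callable[[torch.Tensor], torch.Tensor]]] = None

        def _build_plan(self, x: torch.Tensor):
            # Resolve the dispatch decision once; the real code would pick a concrete
            # quantized kernel here and bind its static arguments.
            sig = PlanSig(str(x.device), x.dtype, tuple(x.shape))
            w, b = self.weight, self.bias
            return sig, (lambda inp: torch.nn.functional.linear(inp, w, b))

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            sig = PlanSig(str(x.device), x.dtype, tuple(x.shape))
            if self._plan is None or self._plan[0] != sig:
                self._plan = self._build_plan(x)  # rebuilt on shape/dtype/device change
            return self._plan[1](x)

The property that matters for CUDA Graph capture is that a built plan closes over fixed tensors and Python scalars only, so invoking it performs no data-dependent Python branching and no GPU->CPU syncs.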
--- diffulex/layer/linear.py | 931 ++++++++++++++++++ .../block_diffusion/engine/model_runner.py | 10 + diffulex/strategy/d2f/engine/model_runner.py | 139 ++- .../fast_dllm_v2/engine/model_runner.py | 10 + .../strategies/linear_awq_w4a16.py | 17 +- .../strategies/linear_gptq_w4a16.py | 12 +- profile/torch_d2f_profiler.py | 9 +- 7 files changed, 1107 insertions(+), 21 deletions(-) diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index 5cc4b6d..9dbef0e 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -1,3 +1,6 @@ +from __future__ import annotations + +from dataclasses import dataclass from typing import Optional import torch @@ -13,6 +16,500 @@ def divide(numerator, denominator): return numerator // denominator +@dataclass +class _ForwardPlanSig: + """Signature for validating cached forward plans. + + We intentionally keep it small and Python-only so it is CUDA-graph friendly + (no `.item()` and no device sync). + """ + + device_type: str + device_index: int + x_dtype: torch.dtype + x_shape: tuple[int, ...] + has_bias: bool + mode: str # "bf16" | "quant" | "offline" + strategy_name: str + + +class _ForwardPlanBase: + sig: _ForwardPlanSig + + def __call__(self, x: torch.Tensor) -> torch.Tensor: # pragma: no cover + raise NotImplementedError + + +class _BF16Plan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._weight = weight + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return F.linear(x, self._weight, self._bias) + + +class _QuantInt8W8A16Plan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales_1xn: torch.Tensor, + out_features: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales_1xn = scales_1xn + self._out_features = int(out_features) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + self._qweight, + self._bias, + quant_kind=self._quant_kind, + quant_scales=self._scales_1xn, + out_features=self._out_features, + ) + + +class _QuantInt8W8A8Plan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales_1xn: torch.Tensor, + out_features: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales_1xn = scales_1xn + self._out_features = int(out_features) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + self._qweight, + self._bias, + quant_kind=self._quant_kind, + quant_scales=self._scales_1xn, + out_features=self._out_features, + ) + + +class _QuantGenericPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + weight: torch.Tensor, + scales: torch.Tensor, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._weight = weight + self._scales = scales + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return 
self._strategy.linear_forward( + x, + self._weight, + self._bias, + quant_kind=self._quant_kind, + quant_scales=self._scales, + ) + + +class _OfflineGPTQPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + g_idx: torch.Tensor, + weight_bits: int, + out_features: int, + in_features: int, + group_size: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._g_idx = g_idx + self._weight_bits = int(weight_bits) + self._out_features = int(out_features) + self._in_features = int(in_features) + self._group_size = int(group_size) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + gptq_qweight=self._qweight, + gptq_qzeros=self._qzeros, + gptq_scales=self._scales, + gptq_g_idx=self._g_idx, + weight_bits=self._weight_bits, + use_v2_format=False, + out_features=self._out_features, + in_features=self._in_features, + group_size=self._group_size, + ) + + +class _OfflineAWQPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + pack_factor: int, + out_features: int, + in_features: int, + group_size: int, + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._pack_factor = int(pack_factor) + self._out_features = int(out_features) + self._in_features = int(in_features) + self._group_size = int(group_size) + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + awq_qweight=self._qweight, + awq_qzeros=self._qzeros, + awq_scales=self._scales, + pack_factor=self._pack_factor, + out_features=self._out_features, + in_features=self._in_features, + group_size=self._group_size, + ) + + +class _OfflineGPTQMarlinPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + g_idx: torch.Tensor, + g_idx_sort_indices: torch.Tensor, + workspace: torch.Tensor, + in_features: int, + out_features: int, + group_size: int, + weight_bits: int, + tp_dim: Optional[int], + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales = scales + self._zp = zp + self._g_idx = g_idx + self._g_idx_sort_indices = g_idx_sort_indices + self._workspace = workspace + self._in_features = int(in_features) + self._out_features = int(out_features) + self._group_size = int(group_size) + self._weight_bits = int(weight_bits) + self._tp_dim = tp_dim + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + qweight=self._qweight, + scales=self._scales, + zp=self._zp, + g_idx=self._g_idx, + 
g_idx_sort_indices=self._g_idx_sort_indices, + workspace=self._workspace, + in_features=self._in_features, + out_features=self._out_features, + group_size=self._group_size, + weight_bits=self._weight_bits, + tp_dim=self._tp_dim, + ) + + +class _OfflineAWQMarlinPlan(_ForwardPlanBase): + def __init__( + self, + *, + sig: _ForwardPlanSig, + strategy, + quant_kind: str, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + workspace: torch.Tensor, + in_features: int, + out_features: int, + group_size: int, + tp_dim: Optional[int], + bias: Optional[torch.Tensor], + ) -> None: + self.sig = sig + self._strategy = strategy + self._quant_kind = (quant_kind or "other").strip().lower() or "other" + self._qweight = qweight + self._scales = scales + self._zp = zp + self._workspace = workspace + self._in_features = int(in_features) + self._out_features = int(out_features) + self._group_size = int(group_size) + self._tp_dim = tp_dim + self._bias = bias + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self._strategy.linear_forward( + x, + None, + self._bias, + quant_kind=self._quant_kind, + qweight=self._qweight, + scales=self._scales, + zp=self._zp, + workspace=self._workspace, + in_features=self._in_features, + out_features=self._out_features, + group_size=self._group_size, + tp_dim=self._tp_dim, + ) + + +class _DirectGPTQGemmPlan(_ForwardPlanBase): + """Direct GPTQ GEMM plan (bypass Python strategy glue). + + This calls `torch.ops._C.gptq_gemm` directly with pre-resolved static args. + """ + + def __init__( + self, + *, + sig: _ForwardPlanSig, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + g_idx: torch.Tensor, + weight_bits: int, + out_features: int, + bias: Optional[torch.Tensor], + use_exllama: bool = True, + use_v2_format: bool = False, + cast_back_to_x_dtype: bool = True, + ) -> None: + self.sig = sig + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._g_idx = g_idx + self._weight_bits = int(weight_bits) + self._out_features = int(out_features) + self._bias = bias + self._use_exllama = bool(use_exllama) + self._use_v2_format = bool(use_v2_format) + self._cast_back = bool(cast_back_to_x_dtype) + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + # vLLM GPTQ kernels expect FP16 activations. 
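+        # The kernel also expects a 2D contiguous activation matrix, so leading dims are
+        # flattened here and the original shape is restored after the GEMM.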
+ x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + x2 = x_in.reshape(-1, x_in.shape[-1]) if x_in.dim() != 2 else x_in + if not x2.is_contiguous(): + x2 = x2.contiguous() + + out = torch.ops._C.gptq_gemm( + x2, + self._qweight, + self._qzeros, + self._scales, + self._g_idx, + self._use_exllama, + self._use_v2_format, + self._weight_bits, + ) + if self._bias is not None: + out.add_(self._bias.to(dtype=out.dtype)) + out = out.reshape(x.shape[:-1] + (self._out_features,)) + if self._cast_back and out.dtype != x.dtype: + return out.to(dtype=x.dtype) + return out + + +class _DirectAWQGemmPlan(_ForwardPlanBase): + """Direct AWQ GEMM plan (bypass Python strategy glue).""" + + def __init__( + self, + *, + sig: _ForwardPlanSig, + awq_gemm, + qweight: torch.Tensor, + qzeros: torch.Tensor, + scales: torch.Tensor, + out_features: int, + bias: Optional[torch.Tensor], + split_k_iters: int = 1, + cast_back_to_x_dtype: bool = True, + ) -> None: + self.sig = sig + self._awq_gemm = awq_gemm + self._qweight = qweight + self._qzeros = qzeros + self._scales = scales + self._out_features = int(out_features) + self._bias = bias + self._split_k_iters = int(split_k_iters) + self._cast_back = bool(cast_back_to_x_dtype) + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + # vLLM AWQ kernels expect FP16 activations. + x_in = x if x.dtype == torch.float16 else x.to(dtype=torch.float16) + reshaped_x = x_in.reshape(-1, x_in.shape[-1]) + if not reshaped_x.is_contiguous(): + reshaped_x = reshaped_x.contiguous() + + out = self._awq_gemm(reshaped_x, self._qweight, self._qzeros, self._scales, self._split_k_iters) + if self._bias is not None: + out.add_(self._bias.to(dtype=out.dtype)) + out = out.reshape(x.shape[:-1] + (self._out_features,)) + if self._cast_back and out.dtype != x.dtype: + return out.to(dtype=x.dtype) + return out + + +class _DirectMarlinGemmPlan(_ForwardPlanBase): + """Direct Marlin GEMM plan (bypass Python strategy glue). + + This calls `torch.ops._C.gptq_marlin_gemm` directly with pre-resolved static args. 
+ """ + + def __init__( + self, + *, + sig: _ForwardPlanSig, + qweight: torch.Tensor, + scales: torch.Tensor, + zp: torch.Tensor, + g_idx: torch.Tensor, + g_idx_sort_indices: torch.Tensor, + workspace: torch.Tensor, + wtype_id: int, + n: int, + is_k_full: bool, + use_atomic_add: bool, + marlin_bias: Optional[torch.Tensor], + cast_back_to_x_dtype: bool = True, + ) -> None: + self.sig = sig + self._qweight = qweight + self._scales = scales + self._zp = zp + self._g_idx = g_idx + self._g_idx_sort_indices = g_idx_sort_indices + self._workspace = workspace + self._wtype_id = int(wtype_id) + self._n = int(n) + self._is_k_full = bool(is_k_full) + self._use_atomic_add = bool(use_atomic_add) + self._bias = marlin_bias + self._cast_back = bool(cast_back_to_x_dtype) + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (int(self._n),) + m = int(reshaped_x.shape[0]) + k = int(reshaped_x.shape[1]) + out = torch.ops._C.gptq_marlin_gemm( + reshaped_x, + None, + self._qweight, + self._bias, + self._scales, + None, + None, + self._zp, + self._g_idx, + self._g_idx_sort_indices, + self._workspace, + self._wtype_id, + m, + int(self._n), + k, + self._is_k_full, + self._use_atomic_add, + True, # use_fp32_reduce + False, # is_zp_float + ) + out = out.reshape(out_shape) + if self._cast_back and out.dtype != x.dtype: + return out.to(dtype=x.dtype) + return out + + class LoRAMixin: """Mixin class to add LoRA support to existing linear layers.""" def __init_lora__(self, r: int = 0, lora_alpha: float = 1.0, lora_dropout: float = 0.0): @@ -147,6 +644,404 @@ def __init__( self._gptq_marlin_is_prepared_py: bool = False self._awq_marlin_is_prepared_py: bool = False + # ---- Forward plan cache (for static/graph-friendly dispatch) ---- + # When enabled, we build a per-layer callable plan that fixes the runtime + # dispatch decisions (bf16 vs quant vs offline, and which concrete kernel path). + # This removes heavy Python branching from the hot path and makes CUDA graph + # capture more stable. + self._forward_plan_enabled: bool = False + self._forward_plan: Optional[_ForwardPlanBase] = None + + def _invalidate_forward_plan(self) -> None: + self._forward_plan = None + + @staticmethod + def _device_index(device: torch.device) -> int: + if device.type == "cuda" and device.index is not None: + return int(device.index) + return -1 + + def enable_forward_plan(self, enabled: bool = True) -> None: + """Enable/disable cached forward plan dispatch for this layer.""" + self._forward_plan_enabled = bool(enabled) + if not self._forward_plan_enabled: + self._invalidate_forward_plan() + + def build_forward_plan_for_static(self, example_x: torch.Tensor, bias: Optional[torch.Tensor]) -> None: + """Build a cached forward plan for a fixed static decode-step shape. + + This should be called during warmup/capture. After building, `_forward_base` + can execute with minimal Python overhead by invoking the cached plan. + """ + strategy = self._get_linear_strategy() + # Ensure we don't keep bf16 and quant weights both resident. + self._maybe_promote_weight_to_quantized_at_runtime(example_x, strategy) + + device = example_x.device + dev_idx = self._device_index(device) + has_bias = bias is not None + strategy_name = getattr(strategy, "name", "") if strategy is not None else "" + + # Offline quantized weights have highest priority. 
+ if self.has_offline_quantized_weight(): + if strategy is None: + raise RuntimeError("Offline quantized weight is present but no linear strategy is configured.") + weight_format = getattr(strategy, "linear_weight_format", None) + out_features, in_features, group_size = self._offline_meta() + sig = _ForwardPlanSig( + device_type=device.type, + device_index=dev_idx, + x_dtype=example_x.dtype, + x_shape=tuple(int(x) for x in example_x.shape), + has_bias=has_bias, + mode="offline", + strategy_name=strategy_name, + ) + + if weight_format == "gptq": + self._maybe_prepare_offline_gptq(example_x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + # Use already-correct g_idx buffer (can be empty), moved once to the example device. + g_idx = self.gptq_g_idx + if g_idx.device != device: + g_idx = g_idx.to(device=device, dtype=torch.int) + + # Prefer direct torch.ops entry point to bypass Python strategy glue. + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_gemm"): + self._forward_plan = _DirectGPTQGemmPlan( + sig=sig, + qweight=self.gptq_qweight, + qzeros=self.gptq_qzeros, + scales=self.gptq_scales, + g_idx=g_idx, + weight_bits=bits, + out_features=out_features, + bias=bias, + use_exllama=True, + use_v2_format=False, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineGPTQPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.gptq_qweight, + qzeros=self.gptq_qzeros, + scales=self.gptq_scales, + g_idx=g_idx, + weight_bits=bits, + out_features=out_features, + in_features=in_features, + group_size=group_size, + bias=bias, + ) + return + + if weight_format == "awq": + bits = int(self._offline_quant_bits_py) if int(self._offline_quant_bits_py) > 0 else 4 + pack_factor = 32 // max(1, bits) + # Prefer direct torch.ops entry point to bypass Python strategy glue. + awq_gemm = None + try: + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"): + awq_gemm = torch.ops._C.awq_gemm + except Exception: + awq_gemm = None + + if awq_gemm is not None: + self._forward_plan = _DirectAWQGemmPlan( + sig=sig, + awq_gemm=awq_gemm, + qweight=self.awq_qweight, + qzeros=self.awq_qzeros, + scales=self.awq_scales, + out_features=out_features, + bias=bias, + split_k_iters=1, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineAWQPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.awq_qweight, + qzeros=self.awq_qzeros, + scales=self.awq_scales, + pack_factor=pack_factor, + out_features=out_features, + in_features=in_features, + group_size=group_size, + bias=bias, + ) + return + + if weight_format == "gptq_marlin": + self._maybe_prepare_offline_gptq_marlin(example_x) + bits = self._infer_gptq_weight_bits(in_features=in_features) + # Prefer direct torch.ops entry point to bypass Python strategy glue. + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_marlin_gemm"): + try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_is_k_full, + marlin_make_empty_g_idx, + should_use_atomic_add_reduce, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore + except Exception: + marlin_is_k_full = None # type: ignore + marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + if scalar_types is None: + # Fall back to the strategy path if vLLM marlin utils are unavailable. 
+ self._forward_plan = _OfflineGPTQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=self.gptq_marlin_g_idx, + g_idx_sort_indices=self.gptq_marlin_g_idx_sort_indices, + workspace=self.gptq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + weight_bits=bits, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + device = example_x.device + dev_key = self._device_index(device) + # Prefer already prepared tensors; if missing, use cached empties. + def _empty() -> torch.Tensor: + if marlin_make_empty_g_idx is not None: + return marlin_make_empty_g_idx(device) + return torch.empty((0,), device=device, dtype=torch.int32) + + g_idx = self.gptq_marlin_g_idx if self.gptq_marlin_g_idx.numel() > 0 else _empty() + g_idx_sort = ( + self.gptq_marlin_g_idx_sort_indices + if self.gptq_marlin_g_idx_sort_indices.numel() > 0 + else _empty() + ) + row_parallel = bool(self.tp_dim == 1) + has_g_idx = bool(g_idx.numel() > 0) + is_k_full = True if marlin_is_k_full is None else marlin_is_k_full(has_g_idx, row_parallel) + + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + reshaped_x = example_x.reshape(-1, example_x.shape[-1]) + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + use_atomic_add = bool( + should_use_atomic_add_reduce(m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype) + ) + + if bits == 4: + wtype = scalar_types.uint4b8 + elif bits == 8: + wtype = scalar_types.uint8b128 + else: + raise RuntimeError(f"gptq_marlin: unsupported weight_bits={bits} (expected 4 or 8)") + + self._forward_plan = _DirectMarlinGemmPlan( + sig=sig, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=g_idx, + g_idx_sort_indices=g_idx_sort, + workspace=self.gptq_marlin_workspace, + wtype_id=wtype.id, + n=out_features, + is_k_full=is_k_full, + use_atomic_add=use_atomic_add, + marlin_bias=marlin_bias, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineGPTQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.gptq_marlin_qweight, + scales=self.gptq_marlin_scales, + zp=self.gptq_marlin_zp, + g_idx=self.gptq_marlin_g_idx, + g_idx_sort_indices=self.gptq_marlin_g_idx_sort_indices, + workspace=self.gptq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + weight_bits=bits, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + if weight_format == "awq_marlin": + self._maybe_prepare_offline_awq_marlin(example_x) + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_marlin_gemm"): + try: + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( # type: ignore + marlin_make_empty_g_idx, + should_use_atomic_add_reduce, + marlin_permute_bias, + ) + from vllm.scalar_type import scalar_types # type: ignore + except Exception: + marlin_make_empty_g_idx = None # type: ignore + should_use_atomic_add_reduce = None # type: ignore + marlin_permute_bias = None # type: ignore + scalar_types = None # type: ignore + + if scalar_types is None: + self._forward_plan = _OfflineAWQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + 
zp=self.awq_marlin_zp, + workspace=self.awq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + device = example_x.device + empty = ( + marlin_make_empty_g_idx(device) + if marlin_make_empty_g_idx is not None + else torch.empty((0,), device=device, dtype=torch.int32) + ) + marlin_bias = None + if bias is not None: + marlin_bias = marlin_permute_bias(bias) if marlin_permute_bias is not None else bias + + reshaped_x = example_x.reshape(-1, example_x.shape[-1]) + m = int(reshaped_x.shape[0]) + n = int(out_features) + k = int(reshaped_x.shape[1]) + use_atomic_add = False + if should_use_atomic_add_reduce is not None: + use_atomic_add = bool( + should_use_atomic_add_reduce(m=m, n=n, k=k, device=device, dtype=reshaped_x.dtype) + ) + + self._forward_plan = _DirectMarlinGemmPlan( + sig=sig, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + zp=self.awq_marlin_zp, + g_idx=empty, + g_idx_sort_indices=empty, + workspace=self.awq_marlin_workspace, + wtype_id=scalar_types.uint4.id, + n=out_features, + is_k_full=True, + use_atomic_add=use_atomic_add, + marlin_bias=marlin_bias, + cast_back_to_x_dtype=True, + ) + else: + self._forward_plan = _OfflineAWQMarlinPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.awq_marlin_qweight, + scales=self.awq_marlin_scales, + zp=self.awq_marlin_zp, + workspace=self.awq_marlin_workspace, + in_features=in_features, + out_features=out_features, + group_size=group_size, + tp_dim=self.tp_dim, + bias=bias, + ) + return + + # If a new offline strategy is added, fall back to the generic runtime dispatcher. + raise RuntimeError( + f"Offline quantized weight is present but strategy weight_format={weight_format!r} is not supported by forward plan." + ) + + # Online/load-time quantized weights. + if self.has_quantized_weight(): + if strategy is None: + raise RuntimeError("Quantized weight is present but no linear strategy is configured.") + sig = _ForwardPlanSig( + device_type=device.type, + device_index=dev_idx, + x_dtype=example_x.dtype, + x_shape=tuple(int(x) for x in example_x.shape), + has_bias=has_bias, + mode="quant", + strategy_name=strategy_name, + ) + if getattr(strategy, "name", "") == "linear_int8_w8a16": + self._forward_plan = _QuantInt8W8A16Plan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.quant_weight_int8, + scales_1xn=self.quant_scales_1xn, + out_features=self._forward_out_features, + bias=bias, + ) + return + if getattr(strategy, "name", "") == "linear_int8_w8a8": + self._forward_plan = _QuantInt8W8A8Plan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + qweight=self.quant_weight_int8, + scales_1xn=self.quant_scales_1xn, + out_features=self._forward_out_features, + bias=bias, + ) + return + self._forward_plan = _QuantGenericPlan( + sig=sig, + strategy=strategy, + quant_kind=self.quant_kind, + weight=self.quant_weight_int8, + scales=self.quant_scales, + bias=bias, + ) + return + + # BF16 weights (no quant). 
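+        # Reached only when neither offline (GPTQ/AWQ) nor load-time quantized buffers are present.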
+ weight = getattr(self, "weight", None) + if weight is None: + raise RuntimeError("No quantized/offline weights are present but bf16 weight is missing.") + sig = _ForwardPlanSig( + device_type=device.type, + device_index=dev_idx, + x_dtype=example_x.dtype, + x_shape=tuple(int(x) for x in example_x.shape), + has_bias=has_bias, + mode="bf16", + strategy_name=strategy_name, + ) + self._forward_plan = _BF16Plan(sig=sig, weight=weight, bias=bias) + def has_quantized_weight(self) -> bool: return self._weight_is_quantized_py and self.quant_weight_int8.numel() > 0 and self.quant_scales.numel() > 0 @@ -364,6 +1259,9 @@ def _infer_module_device() -> torch.device: self._parameters.pop("weight", None) setattr(self, "weight", None) + # Offline weights changed; cached forward plan is no longer valid. + self._invalidate_forward_plan() + def _maybe_prepare_offline_gptq(self, x: torch.Tensor) -> None: """Prepare vLLM GPTQ weights on first use (required gptq_shuffle).""" if self._offline_quant_format_py != 1: @@ -669,6 +1567,8 @@ def set_quantized_weight(self, quant_weight_int8: torch.Tensor, quant_scales: to self.quant_scales_1xn = quant_scales if quant_scales.dim() == 2 else quant_scales.view(1, -1) self._weight_is_quantized.fill_(True) self._weight_is_quantized_py = True + # Quant buffers changed; cached forward plan is no longer valid. + self._invalidate_forward_plan() def _maybe_promote_weight_to_quantized_at_runtime( self, @@ -879,6 +1779,37 @@ def _build_offline_forward_kwargs(self, x: torch.Tensor, strategy) -> dict: def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: """Unified forward dispatcher for bf16 / online quant / offline GPTQ/AWQ.""" + if getattr(self, "_forward_plan_enabled", False): + plan = getattr(self, "_forward_plan", None) + if plan is None: + self.build_forward_plan_for_static(x, bias) + plan = getattr(self, "_forward_plan", None) + if plan is not None: + sig = plan.sig + dev = x.device + dev_idx = self._device_index(dev) + if ( + sig.device_type == dev.type + and sig.device_index == dev_idx + and sig.x_dtype == x.dtype + and sig.x_shape == tuple(int(v) for v in x.shape) + and sig.has_bias == (bias is not None) + ): + return plan(x) + # Static mode but shape/dtype changed: rebuild once and retry. + self.build_forward_plan_for_static(x, bias) + plan = getattr(self, "_forward_plan", None) + if plan is not None: + sig = plan.sig + if ( + sig.device_type == dev.type + and sig.device_index == dev_idx + and sig.x_dtype == x.dtype + and sig.x_shape == tuple(int(v) for v in x.shape) + and sig.has_bias == (bias is not None) + ): + return plan(x) + strategy = self._get_linear_strategy() # Runtime safety net: ensure we don't keep bf16+quant weights both resident. self._maybe_promote_weight_to_quantized_at_runtime(x, strategy) diff --git a/diffulex/strategy/block_diffusion/engine/model_runner.py b/diffulex/strategy/block_diffusion/engine/model_runner.py index cc53221..61a4f99 100644 --- a/diffulex/strategy/block_diffusion/engine/model_runner.py +++ b/diffulex/strategy/block_diffusion/engine/model_runner.py @@ -187,6 +187,16 @@ def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: @torch.inference_mode() def capture_cudagraph(self): + # Enable per-layer forward-plan dispatch to stabilize capture and minimize + # Python branching inside the captured region. 
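+        # Failures here are intentionally non-fatal: if the import or any layer raises,
+        # capture proceeds with the default per-call dispatcher instead of a cached plan.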
+ try: + from diffulex.layer.linear import LinearBase + for m in self.model.modules(): + if isinstance(m, LinearBase): + m.enable_forward_plan(True) + except Exception: + pass + set_warming_up(True) config = self.config hf_config = config.hf_config diff --git a/diffulex/strategy/d2f/engine/model_runner.py b/diffulex/strategy/d2f/engine/model_runner.py index c06fbcd..839c848 100644 --- a/diffulex/strategy/d2f/engine/model_runner.py +++ b/diffulex/strategy/d2f/engine/model_runner.py @@ -292,20 +292,34 @@ def get_step(diff_blk, begin_idx): def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool): if is_prefill or self.enforce_eager or input_ids.size(0) > 512: return self.model.compute_logits(self.model(input_ids, positions)) - bs = input_ids.size(0) + num_tokens = input_ids.size(0) context = fetch_d2f_attn_metadata() - graph = self.graphs[next(x for x in self.graph_bs if x >= bs)] + bucket_tokens = next(x for x in self.graph_bs if x >= num_tokens) + graph = self.graphs[bucket_tokens] graph_vars = self.graph_vars for key, value in graph_vars.items(): if key != "outputs": value.zero_() - graph_vars["input_ids"][:bs] = input_ids - graph_vars["positions"][:bs] = positions - graph_vars["slot_mapping"][:bs] = context.slot_mapping - graph_vars["context_lens"][:bs] = context.context_lens - graph_vars["block_tables"][:bs, : context.block_tables.size(1)] = context.block_tables + graph_vars["input_ids"][:num_tokens] = input_ids + graph_vars["positions"][:num_tokens] = positions + graph_vars["slot_mapping"][:num_tokens] = context.slot_mapping + num_seqs = int(context.context_lens.numel()) + graph_vars["context_lens"][:num_seqs] = context.context_lens + # cu_seqlens are required by unified paged-attn decode kernels. + if getattr(context, "cu_seqlens_q", None) is not None: + graph_vars["cu_seqlens_q"][: num_seqs + 1] = context.cu_seqlens_q + bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) + if bucket_num_seqs > num_seqs: + graph_vars["cu_seqlens_q"][num_seqs + 1 : bucket_num_seqs + 1].fill_(int(num_tokens)) + if getattr(context, "cu_seqlens_k", None) is not None: + graph_vars["cu_seqlens_k"][: num_seqs + 1] = context.cu_seqlens_k + bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) + if bucket_num_seqs > num_seqs: + last_k = context.cu_seqlens_k[num_seqs] + graph_vars["cu_seqlens_k"][num_seqs + 1 : bucket_num_seqs + 1] = last_k + graph_vars["block_tables"][:num_seqs, : context.block_tables.size(1)] = context.block_tables graph.replay() - return self.model.compute_logits(graph_vars["outputs"][:bs]) + return self.model.compute_logits(graph_vars["outputs"][:num_tokens]) def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: input_ids, positions = self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs) @@ -317,8 +331,107 @@ def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: @torch.inference_mode() def capture_cudagraph(self): - """ - TODO: Varlen decoding does not support CUDA graph capture yet. - Can be implemented, but requires drastically high overhead. - """ - raise NotImplementedError("CUDA graph capture for DiffusionLM is not implemented yet.") + # Static-mode CUDA graph capture for D2F decode. + # + # NOTE: + # - This matches `run_model()`'s replay protocol: we only overwrite + # input_ids/positions/slot_mapping/context_lens/block_tables per step. + # - Varlen mode is intentionally not supported here (assume static flow). 
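+        # - cu_seqlens_q/cu_seqlens_k are likewise refreshed before every replay (see
+        #   `run_model()` above), padded out to the captured bucket size when needed.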
+ from tqdm import tqdm + + # Enable per-layer forward-plan dispatch to stabilize capture and minimize + # Python branching inside the captured region. + try: + from diffulex.layer.linear import LinearBase + for m in self.model.modules(): + if isinstance(m, LinearBase): + m.enable_forward_plan(True) + except Exception: + pass + + set_warming_up(True) + config = self.config + hf_config = config.hf_config + diffusion_block_size = int(self.diffusion_block_size) + max_num_seqs = int(self.config.max_num_seqs) + # Graph path is only used when num_tokens <= 512. + max_num_seqs_for_graph = max(1, min(max_num_seqs, 512 // max(1, diffusion_block_size))) + max_num_tokens = max_num_seqs_for_graph * diffusion_block_size + max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size + + # Allocate graph buffers on the same device/dtype as the model. + try: + p0 = next(self.model.parameters()) + graph_device = p0.device + graph_dtype = p0.dtype + except StopIteration: + graph_device = torch.device("cuda") + graph_dtype = torch.float16 + + # Allocate max-size graph buffers. + input_ids = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) + positions = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) + slot_mapping = torch.zeros(max_num_tokens, dtype=torch.int32, device=graph_device) + context_lens = torch.zeros(max_num_seqs_for_graph, dtype=torch.int32, device=graph_device) + block_tables = torch.zeros(max_num_seqs_for_graph, max_num_blocks, dtype=torch.int32, device=graph_device) + outputs = torch.zeros(max_num_tokens, hf_config.hidden_size, dtype=graph_dtype, device=graph_device) + cu_seqlens_q = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) + cu_seqlens_k = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) + + # Capture bucketed graphs by num_tokens (bucketed by num_seqs * diffusion_block_size). + self.graph_bs = [] + seq_bs_list = [1, 2, 4, 8] + list(range(16, max_num_seqs_for_graph + 1, 16)) + for num_seqs in sorted(set([b for b in seq_bs_list if b <= max_num_seqs_for_graph] + [max_num_seqs_for_graph])): + self.graph_bs.append(int(num_seqs) * diffusion_block_size) + self.graphs = {} + self.graph_pool = None + + for num_tokens in tqdm(reversed(self.graph_bs), desc="Capturing CUDA graphs"): + num_seqs = int(num_tokens // diffusion_block_size) + graph = torch.cuda.CUDAGraph() + # Fill placeholder metadata with valid monotonic cu_seqlens to satisfy kernel assertions. + cu_seqlens_q[: num_seqs + 1] = ( + torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * diffusion_block_size + ) + # Use a conservative max-seqlen for K to keep shapes stable; values are overwritten before replay. + cu_seqlens_k[: num_seqs + 1] = ( + torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * int(config.max_model_len) + ) + context_lens[:num_seqs].fill_(int(config.max_model_len)) + # For static decode, use placeholder metadata tensors; per-step values are copied + # into `graph_vars` before replay. 
+ set_d2f_attn_metadata( + False, + slot_mapping=slot_mapping[:num_tokens], + context_lens=context_lens[:num_seqs], + cu_seqlens_q=cu_seqlens_q[: num_seqs + 1], + cu_seqlens_k=cu_seqlens_k[: num_seqs + 1], + max_seqlen_q=diffusion_block_size, + max_seqlen_k=int(config.max_model_len), + block_tables=block_tables[:num_seqs], + kv_cache_layout=self.config.kv_cache_layout, + need_kv_cache_store=True, + diffusion_block_size=self.diffusion_block_size, + decode_mode="static", + attn_type="full_attention", + ) + outputs[:num_tokens] = self.model(input_ids[:num_tokens], positions[:num_tokens]) # warmup + with torch.cuda.graph(graph, self.graph_pool): + outputs[:num_tokens] = self.model(input_ids[:num_tokens], positions[:num_tokens]) # capture + if self.graph_pool is None: + self.graph_pool = graph.pool() + self.graphs[num_tokens] = graph + torch.cuda.synchronize() + reset_d2f_attn_metadata() + + self.graph_vars = dict( + input_ids=input_ids, + positions=positions, + slot_mapping=slot_mapping, + context_lens=context_lens, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + block_tables=block_tables, + outputs=outputs, + ) + reset_warming_up() diff --git a/diffulex/strategy/fast_dllm_v2/engine/model_runner.py b/diffulex/strategy/fast_dllm_v2/engine/model_runner.py index f265c92..1f5f6c4 100644 --- a/diffulex/strategy/fast_dllm_v2/engine/model_runner.py +++ b/diffulex/strategy/fast_dllm_v2/engine/model_runner.py @@ -187,6 +187,16 @@ def run(self, seqs: list[SequenceBase], is_prefill: bool) -> list[int]: @torch.inference_mode() def capture_cudagraph(self): + # Enable per-layer forward-plan dispatch to stabilize capture and minimize + # Python branching inside the captured region. + try: + from diffulex.layer.linear import LinearBase + for m in self.model.modules(): + if isinstance(m, LinearBase): + m.enable_forward_plan(True) + except Exception: + pass + set_warming_up(True) config = self.config hf_config = config.hf_config diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 22295fa..7090d59 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -32,7 +32,17 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): def __init__(self) -> None: super().__init__() - self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm")) + # Resolve the concrete kernel entry point once (avoid per-call dispatch). + awq_gemm = None + try: + if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"): + awq_gemm = torch.ops._C.awq_gemm + except Exception: + awq_gemm = None + if awq_gemm is None and ops is not None and hasattr(ops, "awq_gemm"): + awq_gemm = ops.awq_gemm + self._awq_gemm = awq_gemm + self._ops_available: bool = bool(self._awq_gemm is not None) @property def name(self) -> str: @@ -114,10 +124,7 @@ def linear_forward( # Always use awq_gemm to avoid large temporary dequantized weight allocations. 
# vLLM API: awq_gemm(input, qweight, qzeros, scales, split_k_iters) split_k_iters = 1 - if reshaped_x.is_contiguous() and qweight.is_contiguous() and qzeros.is_contiguous() and scales.is_contiguous(): - out = torch.ops._C.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) - else: - out = ops.awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) + out = self._awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) # type: ignore[misc] if bias is not None: out.add_(bias.to(dtype=out.dtype)) diff --git a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py index 95e5b9e..7adfd10 100644 --- a/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_gptq_w4a16.py @@ -37,6 +37,8 @@ class LinearGPTQW4A16Strategy(LinearQuantizationStrategy): def __init__(self) -> None: super().__init__() self._ops_available: bool = bool(ops is not None and hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "gptq_gemm")) + # Cache empty g_idx tensor per device to avoid per-call allocations. + self._empty_cache: dict[int, torch.Tensor] = {} @property def name(self) -> str: @@ -121,10 +123,16 @@ def linear_forward( if not x2.is_contiguous(): x2 = x2.contiguous() + device = x.device + dev_key = int(device.index) if device.type == "cuda" and device.index is not None else -1 if g_idx is None or g_idx.numel() == 0: - g_idx_t = torch.empty((0,), device=x.device, dtype=torch.int) + empty = self._empty_cache.get(dev_key) + if empty is None or empty.device != device: + empty = torch.empty((0,), device=device, dtype=torch.int) + self._empty_cache[dev_key] = empty + g_idx_t = empty else: - g_idx_t = g_idx if (g_idx.device == x.device and g_idx.dtype == torch.int) else g_idx.to(device=x.device, dtype=torch.int) + g_idx_t = g_idx if (g_idx.device == device and g_idx.dtype == torch.int) else g_idx.to(device=device, dtype=torch.int) output = torch.ops._C.gptq_gemm( x2, diff --git a/profile/torch_d2f_profiler.py b/profile/torch_d2f_profiler.py index 780ef2f..e8d36cb 100644 --- a/profile/torch_d2f_profiler.py +++ b/profile/torch_d2f_profiler.py @@ -106,6 +106,13 @@ def main() -> None: parser.add_argument("--linear-attn-act-dtype", type=str, default="bf16") parser.add_argument("--linear-mlp-act-dtype", type=str, default="bf16") + # CUDA Graph + parser.add_argument( + "--use-cudagraph", + action="store_true", + help="Enable CUDA Graph (only meaningful when decode_mode=static and shapes are stable); off by default so capture overhead does not skew profiling.", + ) + # Engine settings (force single-process profiling by default) parser.add_argument("--tensor-parallel-size", type=int, default=1, help="Keep this at 1; otherwise child processes are spawned and CUDA activity cannot be collected") parser.add_argument("--data-parallel-size", type=int, default=1) @@ -171,7 +178,7 @@ def main() -> None: use_lora=use_lora, model_name="dream", decoding_strategy="d2f", - enforce_eager=True, + enforce_eager=not args.use_cudagraph, tensor_parallel_size=args.tensor_parallel_size, data_parallel_size=args.data_parallel_size, master_addr=args.master_addr, From 0d511452ea43738bd8b55e111a514301ae7aac58 Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Tue, 27 Jan 2026 04:25:22 +0000 Subject: [PATCH 09/10] Fix static+CUDA Graph mode and add benchmark configs - Fix tensor shape mismatch bug in static+CUDA Graph decode mode (model_runner.py) - Improve bucket selection logic for variable token counts - Add safety fallback when runtime batch exceeds captured capacity - Fix metadata buffer initialization and padding - Add new static mode benchmark 
configs: - awq_bf16kv_static.yml - gptq_marlin_w4_bf16kv_static.yml - gptq_marlin_w8_bf16kv_static.yml - Update quantization strategies and loader utilities - Update benchmark configurations for consistency --- diffulex/config.py | 3 +- diffulex/layer/linear.py | 9 ++- diffulex/strategy/d2f/engine/model_runner.py | 74 ++++++++++++------- diffulex/utils/loader.py | 16 ++++ .../strategies/linear_awq_w4a16.py | 38 ++++++++-- diffulex_bench/arg_parser.py | 7 ++ diffulex_bench/configs/awq_bf16kv_static.yml | 47 ++++++++++++ diffulex_bench/configs/awq_bf16kv_varlen.yml | 2 +- .../configs/awq_marlin_bf16kv_varlen.yml | 2 +- .../configs/bf16_bf16kv_distinct.yml | 2 +- diffulex_bench/configs/bf16_bf16kv_static.yml | 2 +- diffulex_bench/configs/bf16_bf16kv_varlen.yml | 2 +- .../configs/bf16_fp8kv_distinct.yml | 2 +- diffulex_bench/configs/bf16_fp8kv_static.yml | 2 +- diffulex_bench/configs/bf16_fp8kv_varlen.yml | 2 +- diffulex_bench/configs/dream_d2f_gsm8k.yml | 2 +- diffulex_bench/configs/example.yml | 2 +- diffulex_bench/configs/fp8_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/gptq_bf16kv_varlen.yml | 2 +- .../configs/gptq_bf16kv_varlen_tp2.yml | 2 +- .../configs/gptq_marlin_bf16kv_varlen.yml | 2 +- .../configs/gptq_marlin_w4_bf16kv_static.yml | 47 ++++++++++++ .../configs/gptq_marlin_w4_bf16kv_varlen.yml | 2 +- .../configs/gptq_marlin_w8_bf16kv_static.yml | 47 ++++++++++++ .../configs/gptq_marlin_w8_bf16kv_varlen.yml | 2 +- .../configs/gptq_w2_bf16kv_varlen.yml | 2 +- .../configs/gptq_w8_bf16kv_varlen.yml | 2 +- .../configs/w4a16_bf16kv_static.yml | 2 +- .../configs/w4a16_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w4a16_fp8kv_static.yml | 2 +- diffulex_bench/configs/w4a16_fp8kv_varlen.yml | 2 +- diffulex_bench/configs/w4a8_bf16kv_static.yml | 2 +- diffulex_bench/configs/w4a8_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w4a8_fp8kv_static.yml | 2 +- diffulex_bench/configs/w4a8_fp8kv_varlen.yml | 2 +- .../configs/w8a16_bf16kv_static.yml | 2 +- .../configs/w8a16_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w8a16_fp8kv_static.yml | 2 +- diffulex_bench/configs/w8a16_fp8kv_varlen.yml | 2 +- diffulex_bench/configs/w8a8_bf16kv_static.yml | 2 +- diffulex_bench/configs/w8a8_bf16kv_varlen.yml | 2 +- diffulex_bench/configs/w8a8_fp8kv_static.yml | 2 +- diffulex_bench/configs/w8a8_fp8kv_varlen.yml | 2 +- diffulex_bench/main.py | 22 ++++++ diffulex_kernel/python/kv_cache_kernels.py | 14 ++++ diffulex_legacy/config.py | 2 +- 46 files changed, 322 insertions(+), 72 deletions(-) create mode 100644 diffulex_bench/configs/awq_bf16kv_static.yml create mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml create mode 100644 diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml diff --git a/diffulex/config.py b/diffulex/config.py index 1086223..99f6c50 100755 --- a/diffulex/config.py +++ b/diffulex/config.py @@ -32,7 +32,8 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - master_port: int = 2333 + # Allow overriding to avoid port collisions in multi-run/CI environments. + master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "diffulex_shm" # Start device index for this TP group (set by DP launcher). 
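For reference, the AWQ call-order change in the linear.py and linear_awq_w4a16.py hunks below can be exercised on its own. The sketch is illustrative only: it assumes a vLLM build that ships either the Triton AWQ kernel (awq_gemm_triton) or the compiled torch.ops._C.awq_gemm op, and the helper names (resolve_awq_gemm, awq_linear) are hypothetical, not part of diffulex.

import torch

def resolve_awq_gemm():
    # Prefer the Triton kernel, then the compiled C++ op, mirroring the dispatch below.
    try:
        from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
        return awq_gemm_triton
    except Exception:
        pass
    if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"):
        return torch.ops._C.awq_gemm
    raise RuntimeError("No AWQ GEMM entry point available (need vLLM Triton or C++ ops).")

def awq_linear(x, qweight, scales, qzeros, bias=None, split_k_iters=1):
    # Both entry points take (input, qweight, scales, qzeros, split_k_iters).
    gemm = resolve_awq_gemm()
    out = gemm(x.reshape(-1, x.shape[-1]), qweight, scales, qzeros, split_k_iters)
    if bias is not None:
        out.add_(bias.to(dtype=out.dtype))
    return out.reshape(*x.shape[:-1], out.shape[-1])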
diff --git a/diffulex/layer/linear.py b/diffulex/layer/linear.py index 9dbef0e..fd16eb3 100755 --- a/diffulex/layer/linear.py +++ b/diffulex/layer/linear.py @@ -432,7 +432,9 @@ def __call__(self, x: torch.Tensor) -> torch.Tensor: if not reshaped_x.is_contiguous(): reshaped_x = reshaped_x.contiguous() - out = self._awq_gemm(reshaped_x, self._qweight, self._qzeros, self._scales, self._split_k_iters) + # vLLM AWQ GEMM entrypoints (C++ op and Triton fallback) use the same order: + # awq_gemm(input, qweight, scales, qzeros, split_k_iters) + out = self._awq_gemm(reshaped_x, self._qweight, self._scales, self._qzeros, self._split_k_iters) if self._bias is not None: out.add_(self._bias.to(dtype=out.dtype)) out = out.reshape(x.shape[:-1] + (self._out_features,)) @@ -1978,8 +1980,9 @@ def _forward_base(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch. return F.linear(x, weight, bias) weight = getattr(self, "weight", None) - if weight is None: - raise RuntimeError("Strategy is configured but weight is missing (expected bf16 weight).") + # NOTE: For offline-quantized strategies (e.g. GPTQ/AWQ/Marlin), the original + # bf16 weight may be intentionally removed after loading to save memory. + # In that case, the quantization strategy must be able to run without it. kwargs = self._maybe_int4_original_in_features_kwargs(strategy, x) if kwargs: return strategy.linear_forward(x, weight, bias, quant_kind=self.quant_kind, **kwargs) diff --git a/diffulex/strategy/d2f/engine/model_runner.py b/diffulex/strategy/d2f/engine/model_runner.py index 839c848..9a020a9 100644 --- a/diffulex/strategy/d2f/engine/model_runner.py +++ b/diffulex/strategy/d2f/engine/model_runner.py @@ -294,30 +294,41 @@ def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill return self.model.compute_logits(self.model(input_ids, positions)) num_tokens = input_ids.size(0) context = fetch_d2f_attn_metadata() - bucket_tokens = next(x for x in self.graph_bs if x >= num_tokens) + candidates = [x for x in self.graph_bs if x >= num_tokens] + if not candidates: + # Safety: fall back if capture didn't include a large-enough bucket. + return self.model.compute_logits(self.model(input_ids, positions)) + bucket_tokens = candidates[0] graph = self.graphs[bucket_tokens] graph_vars = self.graph_vars - for key, value in graph_vars.items(): - if key != "outputs": - value.zero_() + # Safety: fall back if runtime batch exceeds captured metadata capacity. + num_seqs = int(context.context_lens.numel()) + max_num_seqs_for_graph = int(graph_vars["context_lens"].numel()) + if num_seqs > max_num_seqs_for_graph: + return self.model.compute_logits(self.model(input_ids, positions)) + + # Reset buffers to safe defaults (avoid "0" being interpreted as a valid index). + graph_vars["input_ids"].zero_() + graph_vars["positions"].zero_() + graph_vars["slot_mapping"].fill_(-1) + graph_vars["context_lens"].zero_() + graph_vars["block_tables"].fill_(-1) graph_vars["input_ids"][:num_tokens] = input_ids graph_vars["positions"][:num_tokens] = positions graph_vars["slot_mapping"][:num_tokens] = context.slot_mapping - num_seqs = int(context.context_lens.numel()) graph_vars["context_lens"][:num_seqs] = context.context_lens # cu_seqlens are required by unified paged-attn decode kernels. if getattr(context, "cu_seqlens_q", None) is not None: + # Pad to captured length so "extra" sequences become 0-length. 
+ graph_vars["cu_seqlens_q"].fill_(int(num_tokens)) graph_vars["cu_seqlens_q"][: num_seqs + 1] = context.cu_seqlens_q - bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) - if bucket_num_seqs > num_seqs: - graph_vars["cu_seqlens_q"][num_seqs + 1 : bucket_num_seqs + 1].fill_(int(num_tokens)) if getattr(context, "cu_seqlens_k", None) is not None: + last_k = int(context.cu_seqlens_k[num_seqs].item()) + graph_vars["cu_seqlens_k"].fill_(last_k) graph_vars["cu_seqlens_k"][: num_seqs + 1] = context.cu_seqlens_k - bucket_num_seqs = int(bucket_tokens // max(1, int(self.diffusion_block_size))) - if bucket_num_seqs > num_seqs: - last_k = context.cu_seqlens_k[num_seqs] - graph_vars["cu_seqlens_k"][num_seqs + 1 : bucket_num_seqs + 1] = last_k - graph_vars["block_tables"][:num_seqs, : context.block_tables.size(1)] = context.block_tables + + bt_cols = min(int(graph_vars["block_tables"].size(1)), int(context.block_tables.size(1))) + graph_vars["block_tables"][:num_seqs, :bt_cols] = context.block_tables[:, :bt_cols] graph.replay() return self.model.compute_logits(graph_vars["outputs"][:num_tokens]) @@ -355,8 +366,14 @@ def capture_cudagraph(self): diffusion_block_size = int(self.diffusion_block_size) max_num_seqs = int(self.config.max_num_seqs) # Graph path is only used when num_tokens <= 512. - max_num_seqs_for_graph = max(1, min(max_num_seqs, 512 // max(1, diffusion_block_size))) - max_num_tokens = max_num_seqs_for_graph * diffusion_block_size + # + # IMPORTANT: + # In D2F decode, `num_tokens` (sum of per-seq seqlen_q) is NOT guaranteed to equal + # `num_seqs * diffusion_block_size`. A single seq can contribute multiple diffusion blocks, + # so we must bucket by `num_tokens` directly and keep metadata tensors sized by + # `max_num_seqs_for_graph` (padding unused seqs to 0-length via cu_seqlens). + max_num_seqs_for_graph = max(1, min(max_num_seqs, 512)) + max_num_tokens = 512 max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size # Allocate graph buffers on the same device/dtype as the model. @@ -371,33 +388,38 @@ def capture_cudagraph(self): # Allocate max-size graph buffers. input_ids = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) positions = torch.zeros(max_num_tokens, dtype=torch.int64, device=graph_device) - slot_mapping = torch.zeros(max_num_tokens, dtype=torch.int32, device=graph_device) + slot_mapping = torch.full((max_num_tokens,), -1, dtype=torch.int32, device=graph_device) context_lens = torch.zeros(max_num_seqs_for_graph, dtype=torch.int32, device=graph_device) - block_tables = torch.zeros(max_num_seqs_for_graph, max_num_blocks, dtype=torch.int32, device=graph_device) + block_tables = torch.full((max_num_seqs_for_graph, max_num_blocks), -1, dtype=torch.int32, device=graph_device) outputs = torch.zeros(max_num_tokens, hf_config.hidden_size, dtype=graph_dtype, device=graph_device) cu_seqlens_q = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) cu_seqlens_k = torch.zeros(max_num_seqs_for_graph + 1, dtype=torch.int32, device=graph_device) - # Capture bucketed graphs by num_tokens (bucketed by num_seqs * diffusion_block_size). + # Capture bucketed graphs by total num_tokens. 
self.graph_bs = [] - seq_bs_list = [1, 2, 4, 8] + list(range(16, max_num_seqs_for_graph + 1, 16)) - for num_seqs in sorted(set([b for b in seq_bs_list if b <= max_num_seqs_for_graph] + [max_num_seqs_for_graph])): - self.graph_bs.append(int(num_seqs) * diffusion_block_size) + # Keep buckets aligned to diffusion_block_size for stable kernel shapes. + for t in range(diffusion_block_size, max_num_tokens + 1, diffusion_block_size): + self.graph_bs.append(int(t)) self.graphs = {} self.graph_pool = None for num_tokens in tqdm(reversed(self.graph_bs), desc="Capturing CUDA graphs"): - num_seqs = int(num_tokens // diffusion_block_size) + num_seqs = int(max_num_seqs_for_graph) graph = torch.cuda.CUDAGraph() # Fill placeholder metadata with valid monotonic cu_seqlens to satisfy kernel assertions. - cu_seqlens_q[: num_seqs + 1] = ( - torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * diffusion_block_size - ) + # IMPORTANT: cu_seqlens_q must be non-decreasing and end at `num_tokens` + # (it is used to index into Q/slot_mapping which are length `num_tokens`). + # Use a simple placeholder: put all Q tokens into the first seq and make + # the remaining seqs 0-length. + cu_seqlens_q[: num_seqs + 1].fill_(int(num_tokens)) + cu_seqlens_q[0] = 0 # Use a conservative max-seqlen for K to keep shapes stable; values are overwritten before replay. cu_seqlens_k[: num_seqs + 1] = ( torch.arange(num_seqs + 1, dtype=torch.int32, device=graph_device) * int(config.max_model_len) ) context_lens[:num_seqs].fill_(int(config.max_model_len)) + # Use a benign placeholder block table for the first seq. + block_tables[:1].zero_() # For static decode, use placeholder metadata tensors; per-step values are copied # into `graph_vars` before replay. set_d2f_attn_metadata( @@ -406,7 +428,7 @@ def capture_cudagraph(self): context_lens=context_lens[:num_seqs], cu_seqlens_q=cu_seqlens_q[: num_seqs + 1], cu_seqlens_k=cu_seqlens_k[: num_seqs + 1], - max_seqlen_q=diffusion_block_size, + max_seqlen_q=int(num_tokens), max_seqlen_k=int(config.max_model_len), block_tables=block_tables[:num_seqs], kv_cache_layout=self.config.kv_cache_layout, diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index 73ffb92..b78f788 100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -144,6 +144,22 @@ def _set_offline_gptq_marlin_weight( module._offline_quant_out_features = torch.tensor(out_features, dtype=torch.int32, device=module_device) module._offline_quant_in_features = torch.tensor(in_features, dtype=torch.int32, device=module_device) module._gptq_is_shuffled = torch.tensor(False, dtype=torch.bool, device=module_device) + # Keep Python-side mirrors in sync; runtime fast paths rely on these and + # must not `.item()` from CUDA tensors (graph capture / perf). 
+ if hasattr(module, "_offline_quant_format_py"): + module._offline_quant_format_py = 1 + if hasattr(module, "_offline_quant_bits_py"): + module._offline_quant_bits_py = int(bits) + if hasattr(module, "_offline_quant_group_size_py"): + module._offline_quant_group_size_py = int(group_size) + if hasattr(module, "_offline_quant_out_features_py"): + module._offline_quant_out_features_py = int(out_features) + if hasattr(module, "_offline_quant_in_features_py"): + module._offline_quant_in_features_py = int(in_features) + if hasattr(module, "_gptq_is_shuffled_py"): + module._gptq_is_shuffled_py = False + if hasattr(module, "_gptq_marlin_is_prepared_py"): + module._gptq_marlin_is_prepared_py = False # Reset marlin-prep caches (workspace/zp/g_idx meta will be created on first forward). module._gptq_marlin_is_prepared = torch.tensor(False, dtype=torch.bool, device=module_device) diff --git a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py index 7090d59..ea6675d 100644 --- a/diffulex/utils/quantization/strategies/linear_awq_w4a16.py +++ b/diffulex/utils/quantization/strategies/linear_awq_w4a16.py @@ -23,6 +23,12 @@ except Exception: # pragma: no cover ops = None # type: ignore +try: + # Triton fallback path for AWQ GEMM (works even when C++/CUDA ops are not built). + from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton # type: ignore +except Exception: # pragma: no cover + awq_gemm_triton = None # type: ignore + @register_linear_strategy(weight_dtype="awq", act_dtype="bf16") def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: @@ -32,17 +38,23 @@ def _build_linear_awq_w4a16() -> LinearQuantizationStrategy: class LinearAWQW4A16Strategy(LinearQuantizationStrategy): def __init__(self) -> None: super().__init__() - # Resolve the concrete kernel entry point once (avoid per-call dispatch). + # Resolve the concrete kernel entry points once (avoid per-call dispatch). + self._awq_gemm_cpp = None + self._awq_gemm_triton = awq_gemm_triton + awq_gemm = None try: if hasattr(torch.ops, "_C") and hasattr(torch.ops._C, "awq_gemm"): awq_gemm = torch.ops._C.awq_gemm except Exception: awq_gemm = None - if awq_gemm is None and ops is not None and hasattr(ops, "awq_gemm"): - awq_gemm = ops.awq_gemm - self._awq_gemm = awq_gemm - self._ops_available: bool = bool(self._awq_gemm is not None) + # Prefer the real C++ op if present; otherwise keep `None` and fall back to Triton. + self._awq_gemm_cpp = awq_gemm + # Keep the python wrapper as a last resort (it may route to Triton or to torch.ops._C). + self._awq_gemm_py = ops.awq_gemm if (ops is not None and hasattr(ops, "awq_gemm")) else None + self._ops_available: bool = bool( + self._awq_gemm_cpp is not None or self._awq_gemm_triton is not None or self._awq_gemm_py is not None + ) @property def name(self) -> str: @@ -122,9 +134,21 @@ def linear_forward( reshaped_x = x_in.reshape(-1, x_in.shape[-1]) # Always use awq_gemm to avoid large temporary dequantized weight allocations. 
- # vLLM API: awq_gemm(input, qweight, qzeros, scales, split_k_iters) + # vLLM API: + # - C++ op: awq_gemm(input, qweight, scales, qzeros, split_k_iters) + # - Triton : awq_gemm_triton(input, qweight, scales, qzeros, split_k_iters) split_k_iters = 1 - out = self._awq_gemm(reshaped_x, qweight, qzeros, scales, split_k_iters) # type: ignore[misc] + if self._awq_gemm_triton is not None: + out = self._awq_gemm_triton(reshaped_x, qweight, scales, qzeros, split_k_iters) # type: ignore[misc] + elif self._awq_gemm_cpp is not None: + out = self._awq_gemm_cpp(reshaped_x, qweight, scales, qzeros, split_k_iters) # type: ignore[misc] + elif self._awq_gemm_py is not None: + out = self._awq_gemm_py(reshaped_x, qweight, scales, qzeros, split_k_iters) # type: ignore[misc] + else: + raise RuntimeError( + "vLLM is required for AWQ W4A16 but no available kernel entry point was found " + "(missing both Triton and C++ awq_gemm)." + ) if bias is not None: out.add_(bias.to(dtype=out.dtype)) diff --git a/diffulex_bench/arg_parser.py b/diffulex_bench/arg_parser.py index c0978ed..d4f786c 100644 --- a/diffulex_bench/arg_parser.py +++ b/diffulex_bench/arg_parser.py @@ -210,6 +210,13 @@ def create_argument_parser() -> argparse.ArgumentParser: action="store_true", help="Enforce eager mode (disable CUDA graphs)", ) + parser.add_argument( + "--no-enforce-eager", + dest="enforce_eager", + action="store_false", + help="Disable eager mode (enable CUDA graphs when supported)", + ) + parser.set_defaults(enforce_eager=None) parser.add_argument( "--kv-cache-layout", type=str, diff --git a/diffulex_bench/configs/awq_bf16kv_static.yml b/diffulex_bench/configs/awq_bf16kv_static.yml new file mode 100644 index 0000000..4cdb2fa --- /dev/null +++ b/diffulex_bench/configs/awq_bf16kv_static.yml @@ -0,0 +1,47 @@ +# AWQ (W4A16) + BF16 KV Cache (static mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-awq" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 4096 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "static" + linear_attn_weight_dtype: "awq" + linear_mlp_weight_dtype: "awq" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_static/awq_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/awq_bf16kv_varlen.yml b/diffulex_bench/configs/awq_bf16kv_varlen.yml index 62c2cb8..6ae2e46 100644 --- a/diffulex_bench/configs/awq_bf16kv_varlen.yml +++ b/diffulex_bench/configs/awq_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml index 8c76f4e..c27e4ec 100644 --- a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml +++ b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 
gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_bf16kv_distinct.yml b/diffulex_bench/configs/bf16_bf16kv_distinct.yml index 1800ef2..5cf750c 100644 --- a/diffulex_bench/configs/bf16_bf16kv_distinct.yml +++ b/diffulex_bench/configs/bf16_bf16kv_distinct.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_bf16kv_static.yml b/diffulex_bench/configs/bf16_bf16kv_static.yml index c83e028..d36e39d 100644 --- a/diffulex_bench/configs/bf16_bf16kv_static.yml +++ b/diffulex_bench/configs/bf16_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_bf16kv_varlen.yml b/diffulex_bench/configs/bf16_bf16kv_varlen.yml index 4a6b794..8258035 100644 --- a/diffulex_bench/configs/bf16_bf16kv_varlen.yml +++ b/diffulex_bench/configs/bf16_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_fp8kv_distinct.yml b/diffulex_bench/configs/bf16_fp8kv_distinct.yml index 4cbbb8e..bc0fdd5 100644 --- a/diffulex_bench/configs/bf16_fp8kv_distinct.yml +++ b/diffulex_bench/configs/bf16_fp8kv_distinct.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_fp8kv_static.yml b/diffulex_bench/configs/bf16_fp8kv_static.yml index ff429df..ee0af7f 100644 --- a/diffulex_bench/configs/bf16_fp8kv_static.yml +++ b/diffulex_bench/configs/bf16_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/bf16_fp8kv_varlen.yml b/diffulex_bench/configs/bf16_fp8kv_varlen.yml index bcfbc9f..973ec91 100644 --- a/diffulex_bench/configs/bf16_fp8kv_varlen.yml +++ b/diffulex_bench/configs/bf16_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/dream_d2f_gsm8k.yml b/diffulex_bench/configs/dream_d2f_gsm8k.yml index e55b9be..74d1b07 100644 --- a/diffulex_bench/configs/dream_d2f_gsm8k.yml +++ b/diffulex_bench/configs/dream_d2f_gsm8k.yml @@ -10,7 +10,7 @@ engine: tensor_parallel_size: 1 data_parallel_size: 1 gpu_memory_utilization: 0.9 - max_model_len: 2048 + max_model_len: 4096 use_lora: false enforce_eager: false diff --git a/diffulex_bench/configs/example.yml b/diffulex_bench/configs/example.yml index 41f0839..bbdcbc5 100644 --- a/diffulex_bench/configs/example.yml +++ b/diffulex_bench/configs/example.yml @@ -20,7 +20,7 @@ engine: # Memory and capacity configuration gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/fp8_bf16kv_varlen.yml b/diffulex_bench/configs/fp8_bf16kv_varlen.yml index 2ac105b..f6fb081 100644 --- a/diffulex_bench/configs/fp8_bf16kv_varlen.yml +++ 
b/diffulex_bench/configs/fp8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_bf16kv_varlen.yml index b7fd14d..3ff8759 100644 --- a/diffulex_bench/configs/gptq_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml index 1505192..4eb16cd 100644 --- a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml +++ b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml index 858b31a..06d9733 100644 --- a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml new file mode 100644 index 0000000..8ba23c3 --- /dev/null +++ b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W4, A16) + BF16 KV Cache (static mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 4096 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: false # Enable CUDA Graph for static mode + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "static" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_static/gptq_marlin_w4_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml index f8265d3..3702baf 100644 --- a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml new file mode 100644 index 0000000..06bb08b --- /dev/null +++ 
b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml @@ -0,0 +1,47 @@ +# GPTQ Marlin (W8, A16) + BF16 KV Cache (static mode) +engine: + model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" + tokenizer_path: null + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + use_lora: false + lora_path: "" + + tensor_parallel_size: 1 + data_parallel_size: 1 + + gpu_memory_utilization: 0.7 + max_model_len: 4096 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + enforce_eager: true + kv_cache_layout: "unified" + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + + # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache + kv_cache_dtype: "bf16" + decode_mode: "static" + linear_attn_weight_dtype: "gptq_marlin" + linear_mlp_weight_dtype: "gptq_marlin" + linear_attn_act_dtype: "bf16" + linear_mlp_act_dtype: "bf16" + +eval: + dataset_name: "gsm8k" + dataset_split: "test" + dataset_limit: 10 + + temperature: 0.0 + max_tokens: 512 + ignore_eos: false + + output_dir: "benchmark_results_static/gptq_marlin_w8_bf16kv" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml index e20c9be..da2cfdc 100644 --- a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml index 03fe3e7..0e60faa 100644 --- a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml index 1f68616..b1bf8ad 100644 --- a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_bf16kv_static.yml b/diffulex_bench/configs/w4a16_bf16kv_static.yml index 79d9825..c8a2d95 100644 --- a/diffulex_bench/configs/w4a16_bf16kv_static.yml +++ b/diffulex_bench/configs/w4a16_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml b/diffulex_bench/configs/w4a16_bf16kv_varlen.yml index 52230fc..609dd3d 100644 --- a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w4a16_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_fp8kv_static.yml b/diffulex_bench/configs/w4a16_fp8kv_static.yml index 22225a1..8f707a6 100644 --- a/diffulex_bench/configs/w4a16_fp8kv_static.yml +++ b/diffulex_bench/configs/w4a16_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: 
data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml b/diffulex_bench/configs/w4a16_fp8kv_varlen.yml index c1b943f..bf7381b 100644 --- a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w4a16_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_bf16kv_static.yml b/diffulex_bench/configs/w4a8_bf16kv_static.yml index 841050e..4741aa5 100644 --- a/diffulex_bench/configs/w4a8_bf16kv_static.yml +++ b/diffulex_bench/configs/w4a8_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml b/diffulex_bench/configs/w4a8_bf16kv_varlen.yml index 4df0089..8ce0145 100644 --- a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w4a8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_fp8kv_static.yml b/diffulex_bench/configs/w4a8_fp8kv_static.yml index 1676393..08da846 100644 --- a/diffulex_bench/configs/w4a8_fp8kv_static.yml +++ b/diffulex_bench/configs/w4a8_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml b/diffulex_bench/configs/w4a8_fp8kv_varlen.yml index 4725d6a..8dd80ec 100644 --- a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w4a8_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_bf16kv_static.yml b/diffulex_bench/configs/w8a16_bf16kv_static.yml index 9ba90fb..7f54b1c 100644 --- a/diffulex_bench/configs/w8a16_bf16kv_static.yml +++ b/diffulex_bench/configs/w8a16_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml b/diffulex_bench/configs/w8a16_bf16kv_varlen.yml index 4b50d5f..9c0efaa 100644 --- a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w8a16_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_fp8kv_static.yml b/diffulex_bench/configs/w8a16_fp8kv_static.yml index 9771043..27243b9 100644 --- a/diffulex_bench/configs/w8a16_fp8kv_static.yml +++ b/diffulex_bench/configs/w8a16_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml b/diffulex_bench/configs/w8a16_fp8kv_varlen.yml index e282a27..ddd04ab 100644 --- 
a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w8a16_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a8_bf16kv_static.yml b/diffulex_bench/configs/w8a8_bf16kv_static.yml index bd9753d..e34456c 100644 --- a/diffulex_bench/configs/w8a8_bf16kv_static.yml +++ b/diffulex_bench/configs/w8a8_bf16kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml b/diffulex_bench/configs/w8a8_bf16kv_varlen.yml index e1d9ecb..57e919b 100644 --- a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml +++ b/diffulex_bench/configs/w8a8_bf16kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.5 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 2048 max_num_seqs: 64 diff --git a/diffulex_bench/configs/w8a8_fp8kv_static.yml b/diffulex_bench/configs/w8a8_fp8kv_static.yml index 30f71ca..da5b9c6 100644 --- a/diffulex_bench/configs/w8a8_fp8kv_static.yml +++ b/diffulex_bench/configs/w8a8_fp8kv_static.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml b/diffulex_bench/configs/w8a8_fp8kv_varlen.yml index 0467144..1ae985b 100644 --- a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml +++ b/diffulex_bench/configs/w8a8_fp8kv_varlen.yml @@ -13,7 +13,7 @@ engine: data_parallel_size: 1 gpu_memory_utilization: 0.7 - max_model_len: 2048 + max_model_len: 4096 max_num_batched_tokens: 4096 max_num_seqs: 128 diff --git a/diffulex_bench/main.py b/diffulex_bench/main.py index 15bac16..f6a7ae8 100644 --- a/diffulex_bench/main.py +++ b/diffulex_bench/main.py @@ -206,12 +206,34 @@ def load_config_from_args(args) -> BenchmarkConfig: # Override with command line arguments if provided if args.model_path: config.engine.model_path = args.model_path + if getattr(args, "tokenizer_path", None): + config.engine.tokenizer_path = args.tokenizer_path if args.dataset: config.eval.dataset_name = args.dataset if args.dataset_limit is not None: config.eval.dataset_limit = args.dataset_limit + if getattr(args, "max_tokens", None) is not None: + config.eval.max_tokens = args.max_tokens + if getattr(args, "temperature", None) is not None: + config.eval.temperature = args.temperature if args.output_dir: config.eval.output_dir = args.output_dir + + # Engine overrides (make bench configs reusable for eager vs CUDA Graph comparisons) + if getattr(args, "enforce_eager", None) is not None: + config.engine.enforce_eager = bool(args.enforce_eager) + if getattr(args, "kv_cache_layout", None) is not None: + config.engine.kv_cache_layout = args.kv_cache_layout + if getattr(args, "decode_mode", None) is not None: + config.engine.decode_mode = args.decode_mode + if getattr(args, "kv_cache_dtype", None) is not None: + config.engine.kv_cache_dtype = args.kv_cache_dtype + if getattr(args, "max_model_len", None) is not None: + config.engine.max_model_len = args.max_model_len + if getattr(args, "max_num_seqs", None) is not None: + config.engine.max_num_seqs = args.max_num_seqs + if getattr(args, "max_num_batched_tokens", None) is not None: + config.engine.max_num_batched_tokens = 
args.max_num_batched_tokens else: if not args.model_path: logger.error("Either --config or --model-path must be provided") diff --git a/diffulex_kernel/python/kv_cache_kernels.py b/diffulex_kernel/python/kv_cache_kernels.py index 514c8fe..8010042 100755 --- a/diffulex_kernel/python/kv_cache_kernels.py +++ b/diffulex_kernel/python/kv_cache_kernels.py @@ -880,6 +880,20 @@ def store_kvcache_unified_layout(key: torch.Tensor, value: torch.Tensor, Store KV cache (unified layout). Dynamically selects the appropriate kernel based on quantization strategy from context. """ + # `slot_mapping` is expected to have one entry per token in `key/value` (dimension 0). + # In some flows (e.g. prefix-cache / partial-prefill), metadata may carry a longer + # mapping for the full sequence while `key/value` only contain the suffix tokens + # actually computed this step. In that case, align by taking the tail. + N = int(key.shape[0]) + if int(slot_mapping.numel()) != N: + if int(slot_mapping.numel()) > N: + slot_mapping = slot_mapping[-N:] + else: + raise AssertionError( + f"slot_mapping is shorter than key/value tokens: " + f"N={N}, slot_mapping.numel()={int(slot_mapping.numel())}" + ) + from diffulex.utils.quantization.context import get_kv_cache_strategy strategy = get_kv_cache_strategy() if strategy is None: diff --git a/diffulex_legacy/config.py b/diffulex_legacy/config.py index a5b1dd6..bd4ec71 100755 --- a/diffulex_legacy/config.py +++ b/diffulex_legacy/config.py @@ -29,7 +29,7 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - master_port: int = 2333 + master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "d2f_vllm" # Start device index for this TP group (set by DP launcher). 
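The replay protocol that the patch above settles on is: pick the smallest captured token bucket that fits, fall back to eager execution when the batch exceeds captured capacity, and pad cu_seqlens so unused sequence slots become 0-length. A condensed sketch of that logic follows; the names (graphs, graph_bs, graph_vars, eager_forward) are placeholders rather than the real runner attributes, and input_ids/positions handling is omitted for brevity.

def replay_or_eager(num_tokens, context, graphs, graph_bs, graph_vars, eager_forward):
    # Smallest captured bucket that can hold this many tokens.
    candidates = [b for b in graph_bs if b >= num_tokens]
    num_seqs = int(context.context_lens.numel())
    if not candidates or num_seqs > int(graph_vars["context_lens"].numel()):
        return eager_forward()  # safety fallback: batch exceeds captured capacity
    bucket = candidates[0]
    # Reset to safe defaults so stale entries are not misread as valid indices.
    graph_vars["slot_mapping"].fill_(-1)
    graph_vars["block_tables"].fill_(-1)
    graph_vars["context_lens"].zero_()
    # Copy this step's metadata; pad cu_seqlens so extra sequences are 0-length.
    graph_vars["slot_mapping"][:num_tokens] = context.slot_mapping
    graph_vars["context_lens"][:num_seqs] = context.context_lens
    graph_vars["cu_seqlens_q"].fill_(int(num_tokens))
    graph_vars["cu_seqlens_q"][: num_seqs + 1] = context.cu_seqlens_q
    graphs[bucket].replay()
    return graph_vars["outputs"][:num_tokens]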
From 8ea87175ea7f92a337672a790fd99e9751d52b2a Mon Sep 17 00:00:00 2001 From: luozixin2 Date: Wed, 28 Jan 2026 02:32:52 +0000 Subject: [PATCH 10/10] =?UTF-8?q?chore:=20=E6=B8=85=E7=90=86=E5=AE=9E?= =?UTF-8?q?=E9=AA=8C=E9=85=8D=E7=BD=AE=E4=B8=8E=E7=8E=AF=E5=A2=83=E5=8F=98?= =?UTF-8?q?=E9=87=8F=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove bench configs and quantization-architecture docs added after v0.0.1 - Move W8A16/DP tuning knobs from env vars into Config/strategy.configure - Drop hard-coded local paths and default GPUs from examples/scripts, and fix syntax issues --- .gitignore | 1 + diffulex/config.py | 8 +- diffulex/engine/dp_worker.py | 4 +- diffulex/utils/quantization/factory.py | 3 + .../strategies/linear_marlin_int8_w8a16.py | 33 +- diffulex/utils/quantization/strategy.py | 10 + diffulex_bench/configs/awq_bf16kv_static.yml | 47 -- diffulex_bench/configs/awq_bf16kv_varlen.yml | 47 -- .../configs/awq_marlin_bf16kv_varlen.yml | 48 -- .../configs/bf16_bf16kv_distinct.yml | 47 -- diffulex_bench/configs/bf16_bf16kv_static.yml | 47 -- diffulex_bench/configs/bf16_bf16kv_varlen.yml | 47 -- .../configs/bf16_fp8kv_distinct.yml | 47 -- diffulex_bench/configs/bf16_fp8kv_static.yml | 47 -- diffulex_bench/configs/bf16_fp8kv_varlen.yml | 47 -- diffulex_bench/configs/fp8_bf16kv_varlen.yml | 48 -- diffulex_bench/configs/gptq_bf16kv_varlen.yml | 47 -- .../configs/gptq_bf16kv_varlen_tp2.yml | 47 -- .../configs/gptq_marlin_bf16kv_varlen.yml | 48 -- .../configs/gptq_marlin_w4_bf16kv_static.yml | 47 -- .../configs/gptq_marlin_w4_bf16kv_varlen.yml | 47 -- .../configs/gptq_marlin_w8_bf16kv_static.yml | 47 -- .../configs/gptq_marlin_w8_bf16kv_varlen.yml | 47 -- .../configs/gptq_w2_bf16kv_varlen.yml | 47 -- .../configs/gptq_w8_bf16kv_varlen.yml | 47 -- .../configs/w4a16_bf16kv_static.yml | 47 -- .../configs/w4a16_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w4a16_fp8kv_static.yml | 47 -- diffulex_bench/configs/w4a16_fp8kv_varlen.yml | 47 -- diffulex_bench/configs/w4a8_bf16kv_static.yml | 47 -- diffulex_bench/configs/w4a8_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w4a8_fp8kv_static.yml | 47 -- diffulex_bench/configs/w4a8_fp8kv_varlen.yml | 47 -- .../configs/w8a16_bf16kv_static.yml | 47 -- .../configs/w8a16_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w8a16_fp8kv_static.yml | 47 -- diffulex_bench/configs/w8a16_fp8kv_varlen.yml | 47 -- diffulex_bench/configs/w8a8_bf16kv_static.yml | 47 -- diffulex_bench/configs/w8a8_bf16kv_varlen.yml | 47 -- diffulex_bench/configs/w8a8_fp8kv_static.yml | 47 -- diffulex_bench/configs/w8a8_fp8kv_varlen.yml | 47 -- diffulex_legacy/config.py | 2 +- diffulex_legacy/engine/dp_engine.py | 4 +- diffulex_profiler/example.py | 11 +- examples/test_dream_diffulex_gsm8k.py | 3 +- examples/test_dream_dvllm_human_eval.py | 3 +- examples/test_fastdllmv2_diffulex_gsm8k.py | 3 +- examples/test_gptq_awq_loading.py | 17 - examples/test_llada_dvllm_human_eval.py | 3 +- examples/test_quantization_generation.py | 36 +- examples/test_sdar_diffulex_gsm8k.py | 3 +- examples/test_sdar_dvllm.py | 4 +- profile/torch_d2f_profiler.py | 30 +- quantization_architecture.md | 149 ----- quantization_architecture_diagram.md | 551 ------------------ 55 files changed, 93 insertions(+), 2433 deletions(-) delete mode 100644 diffulex_bench/configs/awq_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/awq_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/bf16_bf16kv_distinct.yml delete mode 100644 diffulex_bench/configs/bf16_bf16kv_static.yml delete mode 100644 
diffulex_bench/configs/bf16_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/bf16_fp8kv_distinct.yml delete mode 100644 diffulex_bench/configs/bf16_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/bf16_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/fp8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a16_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w4a16_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a16_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w4a16_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a8_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w4a8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w4a8_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w4a8_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a16_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w8a16_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a16_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w8a16_fp8kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a8_bf16kv_static.yml delete mode 100644 diffulex_bench/configs/w8a8_bf16kv_varlen.yml delete mode 100644 diffulex_bench/configs/w8a8_fp8kv_static.yml delete mode 100644 diffulex_bench/configs/w8a8_fp8kv_varlen.yml delete mode 100644 quantization_architecture.md delete mode 100644 quantization_architecture_diagram.md diff --git a/.gitignore b/.gitignore index 0a8ab01..76f8e70 100755 --- a/.gitignore +++ b/.gitignore @@ -54,5 +54,6 @@ GITHUB_ISSUE.md Tilelang-failed_test_cases/ # Benchmark results benchmark_results/ +benchmark_results_tmp/ # Cursor IDE files .cursor/ diff --git a/diffulex/config.py b/diffulex/config.py index 99f6c50..f571f34 100755 --- a/diffulex/config.py +++ b/diffulex/config.py @@ -32,8 +32,7 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - # Allow overriding to avoid port collisions in multi-run/CI environments. - master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) + master_port: int = 2333 # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "diffulex_shm" # Start device index for this TP group (set by DP launcher). @@ -60,6 +59,11 @@ class Config: linear_attn_act_dtype: str = "bf16" linear_mlp_act_dtype: str = "bf16" + # Kernel tuning knobs (avoid environment-variable based tuning in library code). + # Currently used by some W8A16 linear strategies. 
+ linear_w8a16_quant_block_n: int = 256 + linear_w8a16_allspark_cublas_m_threshold: int = 256 + def __post_init__(self): assert os.path.isdir(self.model) assert self.kvcache_block_size % 16 == 0 diff --git a/diffulex/engine/dp_worker.py b/diffulex/engine/dp_worker.py index a76239a..968fa5f 100755 --- a/diffulex/engine/dp_worker.py +++ b/diffulex/engine/dp_worker.py @@ -125,12 +125,10 @@ def __init__(self, model, **kwargs): need_gpus = self.dp_size * cfg.tensor_parallel_size assert len(vis) >= need_gpus, f"Require {need_gpus} GPUs (dp={self.dp_size}, tp={cfg.tensor_parallel_size}), visible {len(vis)}" - # Optional overrides: kwargs['device_ids'] or env D2F_DEVICE_MAP + # Optional overrides: kwargs['device_ids'] override = None if 'device_ids' in kwargs and kwargs['device_ids']: override = list(kwargs['device_ids']) - elif os.environ.get('D2F_DEVICE_MAP'): - override = [int(x) for x in os.environ['D2F_DEVICE_MAP'].split(',') if x.strip() != ''] if override is not None: assert len(override) >= need_gpus, f"device_ids length {len(override)} < required {need_gpus}" # All override devices must be in visible list diff --git a/diffulex/utils/quantization/factory.py b/diffulex/utils/quantization/factory.py index 3b32f96..ee7e3b6 100644 --- a/diffulex/utils/quantization/factory.py +++ b/diffulex/utils/quantization/factory.py @@ -60,6 +60,7 @@ def create_from_config(config) -> QuantizationContext: # KV Cache strategy strategy = QuantizationStrategyFactory.create_kv_cache_strategy(quant_cfg.kv_cache.dtype) + strategy.configure(diffulex_config=config) ctx.set_strategy('kv_cache', strategy) # Linear strategies (weights + activations) by kind @@ -67,12 +68,14 @@ def create_from_config(config) -> QuantizationContext: weight_dtype=quant_cfg.weights.linear_attn_dtype, act_dtype=quant_cfg.activations.linear_attn_dtype, ) + linear_attn.configure(diffulex_config=config) ctx.set_linear_strategy("attn", linear_attn) linear_mlp = _create_linear_strategy( weight_dtype=quant_cfg.weights.linear_mlp_dtype, act_dtype=quant_cfg.activations.linear_mlp_dtype, ) + linear_mlp.configure(diffulex_config=config) ctx.set_linear_strategy("mlp", linear_mlp) # Future: Weight strategy diff --git a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py index c2ff1ce..ceb3630 100644 --- a/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py +++ b/diffulex/utils/quantization/strategies/linear_marlin_int8_w8a16.py @@ -13,7 +13,6 @@ from __future__ import annotations -import os from typing import Any, Optional import torch @@ -62,10 +61,26 @@ def __init__(self) -> None: self._weight_cache: dict[int, tuple[torch.Tensor, torch.Tensor]] = {} # Cache device info and thresholds to reduce per-call CPU overhead. self._sm_info_cache: dict[int, tuple[int, int]] = {} - self._cublas_m_thr: int = self._cublas_m_threshold() + self._quant_block_n: int = 256 + self._cublas_m_thr: int = 256 # One-time availability check (avoid calling `_allspark_is_available()` on every linear). self._allspark_available: bool = _allspark_is_available() + def configure(self, *, diffulex_config: Any | None = None) -> None: + # Prefer explicit config fields over environment-variable based tuning. 
+ if diffulex_config is None: + return + try: + bn = int(getattr(diffulex_config, "linear_w8a16_quant_block_n", self._quant_block_n)) + self._quant_block_n = max(1, bn) + except Exception: + pass + try: + thr = int(getattr(diffulex_config, "linear_w8a16_allspark_cublas_m_threshold", self._cublas_m_thr)) + self._cublas_m_thr = max(1, thr) + except Exception: + pass + @property def name(self) -> str: # NOTE: Keep strategy naming consistent with the public W8A16 INT8 path. @@ -158,11 +173,7 @@ def quantize_weight_for_kernel( # Avoid allocating a full [N,K] fp32 copy (and an extra transpose buffer). # Quantize in small row blocks and (when using AllSpark) write directly into # the repack input layout B_kn=[K,N], so we never materialize q_u8 + transpose. - try: - block_n = int(os.getenv("DIFFULEX_W8A16_QUANT_BLOCK_N", "256")) - except Exception: - block_n = 256 - block_n = max(1, block_n) + block_n = max(1, int(self._quant_block_n)) if self._allspark_available: # AllSpark repack expects B in (K,N) contiguous layout. @@ -234,14 +245,6 @@ def _get_sm_info(self, device: torch.device) -> tuple[int, int]: self._sm_info_cache[idx] = (0, 0) return 0, 0 - def _cublas_m_threshold(self) -> int: - # For decode, M is typically small, so AllSpark custom kernel is preferred. - # For large-M prefill, AllSpark falls back to a dequant+cuBLAS path if M > threshold. - try: - return int(os.getenv("DIFFULEX_ALLSPARK_CUBLAS_M_THRESHOLD", "256")) - except Exception: - return 256 - def linear_forward( self, x: torch.Tensor, diff --git a/diffulex/utils/quantization/strategy.py b/diffulex/utils/quantization/strategy.py index a36e553..7c3b01a 100644 --- a/diffulex/utils/quantization/strategy.py +++ b/diffulex/utils/quantization/strategy.py @@ -84,6 +84,16 @@ def get_scale_shape(self, original_shape: tuple[int, ...], **kwargs) -> tuple[in """ pass + def configure(self, *, diffulex_config: Any | None = None) -> None: + """Optional hook to configure a strategy from Diffulex `Config`. + + We intentionally keep this a no-op by default to avoid forcing configuration + plumbing through every call site. Strategy-specific tuning knobs should be + surfaced via explicit fields on `diffulex.config.Config`, not environment variables. + """ + _ = diffulex_config + return + # ---- Optional capability flags / helpers (non-abstract) ---- # These helpers are used to avoid hard-coding isinstance(...) checks in the runtime. 
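The hunks above wire the new `configure(diffulex_config=...)` hook from `QuantizationStrategyFactory.create_from_config()` into each strategy, so kernel tuning knobs such as `linear_w8a16_quant_block_n` and `linear_w8a16_allspark_cublas_m_threshold` come from `diffulex.config.Config` rather than environment variables. Below is a minimal, self-contained sketch of that flow; the `_ConfigStub` and `_W8A16StrategySketch` names are illustrative stand-ins, not the repository classes, and the defaults simply mirror the previous env-var fallbacks shown in the diff.

```python
# Sketch only: how a linear strategy can pick up tuning knobs from the Diffulex
# Config via the new configure() hook. Field names mirror the diff; the Config
# stand-in is an assumption for illustration.
from dataclasses import dataclass
from typing import Any


@dataclass
class _ConfigStub:
    linear_w8a16_quant_block_n: int = 256
    linear_w8a16_allspark_cublas_m_threshold: int = 256


class _W8A16StrategySketch:
    def __init__(self) -> None:
        # Defaults match the old env-var fallbacks (256 / 256).
        self._quant_block_n = 256
        self._cublas_m_thr = 256

    def configure(self, *, diffulex_config: Any | None = None) -> None:
        # No-op when no config is supplied, mirroring the base-class default.
        if diffulex_config is None:
            return
        self._quant_block_n = max(1, int(getattr(
            diffulex_config, "linear_w8a16_quant_block_n", self._quant_block_n)))
        self._cublas_m_thr = max(1, int(getattr(
            diffulex_config, "linear_w8a16_allspark_cublas_m_threshold", self._cublas_m_thr)))


strategy = _W8A16StrategySketch()
strategy.configure(diffulex_config=_ConfigStub(linear_w8a16_quant_block_n=128))
assert strategy._quant_block_n == 128 and strategy._cublas_m_thr == 256
```

Under this pattern the factory calls `configure()` once at strategy-creation time, so per-call lookups in `quantize_weight_for_kernel()` reduce to reading cached instance fields.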
@property diff --git a/diffulex_bench/configs/awq_bf16kv_static.yml b/diffulex_bench/configs/awq_bf16kv_static.yml deleted file mode 100644 index 4cdb2fa..0000000 --- a/diffulex_bench/configs/awq_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# AWQ (W4A16) + BF16 KV Cache (static mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-awq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "awq" - linear_mlp_weight_dtype: "awq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/awq_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/awq_bf16kv_varlen.yml b/diffulex_bench/configs/awq_bf16kv_varlen.yml deleted file mode 100644 index 6ae2e46..0000000 --- a/diffulex_bench/configs/awq_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# AWQ (W4A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-awq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: AWQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "awq" - linear_mlp_weight_dtype: "awq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/awq_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml deleted file mode 100644 index c27e4ec..0000000 --- a/diffulex_bench/configs/awq_marlin_bf16kv_varlen.yml +++ /dev/null @@ -1,48 +0,0 @@ -# AWQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-awq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: AWQ Marlin + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: 
"awq_marlin" - linear_mlp_weight_dtype: "awq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/awq_marlin_bf16kv" - save_results: true - use_tqdm: true - diff --git a/diffulex_bench/configs/bf16_bf16kv_distinct.yml b/diffulex_bench/configs/bf16_bf16kv_distinct.yml deleted file mode 100644 index 5cf750c..0000000 --- a/diffulex_bench/configs/bf16_bf16kv_distinct.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + BF16 KV Cache (distinct layout) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "distinct" # Test distinct layout - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 # 10 samples for testing - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_distinct/bf16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_bf16kv_static.yml b/diffulex_bench/configs/bf16_bf16kv_static.yml deleted file mode 100644 index d36e39d..0000000 --- a/diffulex_bench/configs/bf16_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/bf16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_bf16kv_varlen.yml b/diffulex_bench/configs/bf16_bf16kv_varlen.yml deleted file mode 100644 index 8258035..0000000 --- a/diffulex_bench/configs/bf16_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - 
gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/bf16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_fp8kv_distinct.yml b/diffulex_bench/configs/bf16_fp8kv_distinct.yml deleted file mode 100644 index bc0fdd5..0000000 --- a/diffulex_bench/configs/bf16_fp8kv_distinct.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + FP8 KV Cache (distinct layout) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "distinct" # Test distinct layout - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 # 10 samples for testing - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_distinct/bf16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_fp8kv_static.yml b/diffulex_bench/configs/bf16_fp8kv_static.yml deleted file mode 100644 index ee0af7f..0000000 --- a/diffulex_bench/configs/bf16_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/bf16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/bf16_fp8kv_varlen.yml b/diffulex_bench/configs/bf16_fp8kv_varlen.yml deleted file mode 100644 index 
973ec91..0000000 --- a/diffulex_bench/configs/bf16_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# BF16 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: BF16 weights + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "bf16" - linear_mlp_weight_dtype: "bf16" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/bf16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/fp8_bf16kv_varlen.yml b/diffulex_bench/configs/fp8_bf16kv_varlen.yml deleted file mode 100644 index f6fb081..0000000 --- a/diffulex_bench/configs/fp8_bf16kv_varlen.yml +++ /dev/null @@ -1,48 +0,0 @@ -# FP8 Linear (vLLM) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: FP8 weights (vLLM ops) + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "fp8" - linear_mlp_weight_dtype: "fp8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/fp8_bf16kv" - save_results: true - use_tqdm: true - diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_bf16kv_varlen.yml deleted file mode 100644 index 3ff8759..0000000 --- a/diffulex_bench/configs/gptq_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W4A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - 
dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_bf16kv" - save_results: true - use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml b/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml deleted file mode 100644 index 4eb16cd..0000000 --- a/diffulex_bench/configs/gptq_bf16kv_varlen_tp2.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W4A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 2 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W4A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_bf16kv" - save_results: true - use_tqdm: true \ No newline at end of file diff --git a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml deleted file mode 100644 index 06d9733..0000000 --- a/diffulex_bench/configs/gptq_marlin_bf16kv_varlen.yml +++ /dev/null @@ -1,48 +0,0 @@ -# GPTQ Marlin (W4/W8, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_bf16kv" - save_results: true - use_tqdm: true - diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml deleted file mode 100644 index 8ba23c3..0000000 --- a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W4, A16) + BF16 KV Cache (static mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - 
max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: false # Enable CUDA Graph for static mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/gptq_marlin_w4_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml deleted file mode 100644 index 3702baf..0000000 --- a/diffulex_bench/configs/gptq_marlin_w4_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W4, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w4" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W4) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_w4_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml deleted file mode 100644 index 06bb08b..0000000 --- a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W8, A16) + BF16 KV Cache (static mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/gptq_marlin_w8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml 
b/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml deleted file mode 100644 index da2cfdc..0000000 --- a/diffulex_bench/configs/gptq_marlin_w8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ Marlin (W8, A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-marlin-w8" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ Marlin (W8) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq_marlin" - linear_mlp_weight_dtype: "gptq_marlin" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_marlin_w8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml deleted file mode 100644 index 0e60faa..0000000 --- a/diffulex_bench/configs/gptq_w2_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W2A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w2" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W2A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_w2_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml b/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml deleted file mode 100644 index b1bf8ad..0000000 --- a/diffulex_bench/configs/gptq_w8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# GPTQ (W8A16) + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/lzx/Dream-v0-Base-7B-gptq-w8" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: GPTQ (W8A16) + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - 
decode_mode: "varlen" - linear_attn_weight_dtype: "gptq" - linear_mlp_weight_dtype: "gptq" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/gptq_w8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_bf16kv_static.yml b/diffulex_bench/configs/w4a16_bf16kv_static.yml deleted file mode 100644 index c8a2d95..0000000 --- a/diffulex_bench/configs/w4a16_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml b/diffulex_bench/configs/w4a16_bf16kv_varlen.yml deleted file mode 100644 index 609dd3d..0000000 --- a/diffulex_bench/configs/w4a16_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_fp8kv_static.yml b/diffulex_bench/configs/w4a16_fp8kv_static.yml deleted file mode 100644 index 8f707a6..0000000 --- a/diffulex_bench/configs/w4a16_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - 
gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml b/diffulex_bench/configs/w4a16_fp8kv_varlen.yml deleted file mode 100644 index bf7381b..0000000 --- a/diffulex_bench/configs/w4a16_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A16 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_bf16kv_static.yml b/diffulex_bench/configs/w4a8_bf16kv_static.yml deleted file mode 100644 index 4741aa5..0000000 --- a/diffulex_bench/configs/w4a8_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml b/diffulex_bench/configs/w4a8_bf16kv_varlen.yml deleted file mode 100644 
index 8ce0145..0000000 --- a/diffulex_bench/configs/w4a8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_fp8kv_static.yml b/diffulex_bench/configs/w4a8_fp8kv_static.yml deleted file mode 100644 index 08da846..0000000 --- a/diffulex_bench/configs/w4a8_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w4a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml b/diffulex_bench/configs/w4a8_fp8kv_varlen.yml deleted file mode 100644 index 8dd80ec..0000000 --- a/diffulex_bench/configs/w4a8_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W4A8 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT4 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int4" - linear_mlp_weight_dtype: "int4" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" 
- -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w4a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_bf16kv_static.yml b/diffulex_bench/configs/w8a16_bf16kv_static.yml deleted file mode 100644 index 7f54b1c..0000000 --- a/diffulex_bench/configs/w8a16_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml b/diffulex_bench/configs/w8a16_bf16kv_varlen.yml deleted file mode 100644 index 9c0efaa..0000000 --- a/diffulex_bench/configs/w8a16_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + BF16 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w8a16_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_fp8kv_static.yml b/diffulex_bench/configs/w8a16_fp8kv_static.yml deleted file mode 100644 index 27243b9..0000000 --- a/diffulex_bench/configs/w8a16_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for 
DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml b/diffulex_bench/configs/w8a16_fp8kv_varlen.yml deleted file mode 100644 index ddd04ab..0000000 --- a/diffulex_bench/configs/w8a16_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A16 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + BF16 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "bf16" - linear_mlp_act_dtype: "bf16" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w8a16_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_bf16kv_static.yml b/diffulex_bench/configs/w8a8_bf16kv_static.yml deleted file mode 100644 index e34456c..0000000 --- a/diffulex_bench/configs/w8a8_bf16kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + BF16 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml b/diffulex_bench/configs/w8a8_bf16kv_varlen.yml deleted file mode 100644 index 57e919b..0000000 --- a/diffulex_bench/configs/w8a8_bf16kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + BF16 KV Cache (varlen mode) -engine: - model_path: 
"/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.5 - max_model_len: 4096 - max_num_batched_tokens: 2048 - max_num_seqs: 64 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + BF16 KV cache - kv_cache_dtype: "bf16" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_varlen/w8a8_bf16kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_fp8kv_static.yml b/diffulex_bench/configs/w8a8_fp8kv_static.yml deleted file mode 100644 index da5b9c6..0000000 --- a/diffulex_bench/configs/w8a8_fp8kv_static.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + FP8 KV Cache (static mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # CUDA graph not implemented yet for DiffusionLM - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "static" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: "benchmark_results_static/w8a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml b/diffulex_bench/configs/w8a8_fp8kv_varlen.yml deleted file mode 100644 index 1ae985b..0000000 --- a/diffulex_bench/configs/w8a8_fp8kv_varlen.yml +++ /dev/null @@ -1,47 +0,0 @@ -# W8A8 + FP8 KV Cache (varlen mode) -engine: - model_path: "/data1/ckpts/Dream-org/Dream-v0-Base-7B" - tokenizer_path: null - model_name: "dream" - decoding_strategy: "d2f" - mask_token_id: 151666 - - use_lora: false - lora_path: "" - - tensor_parallel_size: 1 - data_parallel_size: 1 - - gpu_memory_utilization: 0.7 - max_model_len: 4096 - max_num_batched_tokens: 4096 - max_num_seqs: 128 - - enforce_eager: true # Required for varlen mode - kv_cache_layout: "unified" - - accept_threshold: 0.9 - complete_threshold: 0.95 - add_new_block_threshold: 0.1 - diffusion_block_size: 32 - - # Quantization: INT8 weights + INT8 activations + FP8 KV cache - kv_cache_dtype: "fp8_e4m3" - decode_mode: "varlen" - linear_attn_weight_dtype: "int8" - linear_mlp_weight_dtype: "int8" - linear_attn_act_dtype: "int8" - linear_mlp_act_dtype: "int8" - -eval: - dataset_name: "gsm8k" - dataset_split: "test" - dataset_limit: 10 - - temperature: 0.0 - max_tokens: 512 - ignore_eos: false - - output_dir: 
"benchmark_results_varlen/w8a8_fp8kv" - save_results: true - use_tqdm: true diff --git a/diffulex_legacy/config.py b/diffulex_legacy/config.py index bd4ec71..a5b1dd6 100755 --- a/diffulex_legacy/config.py +++ b/diffulex_legacy/config.py @@ -29,7 +29,7 @@ class Config: # Distributed comm (per tensor-parallel group). When using multiple DP # replicas on one host, assign unique master_port per replica. master_addr: str = "localhost" - master_port: int = int(os.environ.get("DIFFULEX_MASTER_PORT", "2333")) + master_port: int = 2333 # Shared memory segment name for intra-TP RPC; must be unique per DP group. shm_name: str = "d2f_vllm" # Start device index for this TP group (set by DP launcher). diff --git a/diffulex_legacy/engine/dp_engine.py b/diffulex_legacy/engine/dp_engine.py index 70f8e82..b9da2b7 100755 --- a/diffulex_legacy/engine/dp_engine.py +++ b/diffulex_legacy/engine/dp_engine.py @@ -115,12 +115,10 @@ def __init__(self, model, **kwargs): need_gpus = self.dp_size * cfg.tensor_parallel_size assert len(vis) >= need_gpus, f"Require {need_gpus} GPUs (dp={self.dp_size}, tp={cfg.tensor_parallel_size}), visible {len(vis)}" - # Optional overrides: kwargs['device_ids'] or env D2F_DEVICE_MAP + # Optional overrides: kwargs['device_ids'] override = None if 'device_ids' in kwargs and kwargs['device_ids']: override = list(kwargs['device_ids']) - elif os.environ.get('D2F_DEVICE_MAP'): - override = [int(x) for x in os.environ['D2F_DEVICE_MAP'].split(',') if x.strip() != ''] if override is not None: assert len(override) >= need_gpus, f"device_ids length {len(override)} < required {need_gpus}" # All override devices must be in visible list diff --git a/diffulex_profiler/example.py b/diffulex_profiler/example.py index 8982990..64e07f5 100644 --- a/diffulex_profiler/example.py +++ b/diffulex_profiler/example.py @@ -67,7 +67,16 @@ def example_multiple_sections(): # Profile model loading with profiler.profile("model_loading"): - llm = Diffulex(model_path, model_name="dream", ...) 
+ model_path = "/path/to/your/model" + llm = Diffulex( + model_path, + model_name="dream", + tensor_parallel_size=1, + data_parallel_size=1, + gpu_memory_utilization=0.25, + max_model_len=2048, + decoding_strategy="d2f", + ) # Profile prefill prompts = ["Prompt 1", "Prompt 2"] diff --git a/examples/test_dream_diffulex_gsm8k.py b/examples/test_dream_diffulex_gsm8k.py index de3a2aa..e15d95d 100755 --- a/examples/test_dream_diffulex_gsm8k.py +++ b/examples/test_dream_diffulex_gsm8k.py @@ -64,4 +64,5 @@ "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_dream_dvllm_human_eval.py b/examples/test_dream_dvllm_human_eval.py index 2d95f00..9e72be6 100755 --- a/examples/test_dream_dvllm_human_eval.py +++ b/examples/test_dream_dvllm_human_eval.py @@ -84,4 +84,5 @@ def summarize_profiling(csv_path: str) -> dict: "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_fastdllmv2_diffulex_gsm8k.py b/examples/test_fastdllmv2_diffulex_gsm8k.py index 02217b2..1fc1860 100755 --- a/examples/test_fastdllmv2_diffulex_gsm8k.py +++ b/examples/test_fastdllmv2_diffulex_gsm8k.py @@ -86,4 +86,5 @@ def summarize_profiling(csv_path: str) -> dict: "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_gptq_awq_loading.py b/examples/test_gptq_awq_loading.py index a9a40fa..3cb8eed 100644 --- a/examples/test_gptq_awq_loading.py +++ b/examples/test_gptq_awq_loading.py @@ -25,23 +25,6 @@ except Exception: pass -# 自动设置 CUDA 12.2 路径(如果存在) -_CUDA_12_2_PATH = Path("/home/lzx/cuda-12.2") -if _CUDA_12_2_PATH.exists(): - os.environ["CUDA_HOME"] = str(_CUDA_12_2_PATH) - os.environ["CUDA_PATH"] = str(_CUDA_12_2_PATH) - os.environ["PATH"] = f"{_CUDA_12_2_PATH}/bin:{os.environ.get('PATH', '')}" - os.environ["LD_LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}" - os.environ["LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LIBRARY_PATH', '')}" - os.environ["CPATH"] = f"{_CUDA_12_2_PATH}/include:{os.environ.get('CPATH', '')}" - os.environ["CUDACXX"] = str(_CUDA_12_2_PATH / "bin" / "nvcc") - print(f"[INFO] 已自动设置 CUDA 路径: {_CUDA_12_2_PATH}") - -# 设置使用 GPU1(如果 GPU0 被占用) -if "CUDA_VISIBLE_DEVICES" not in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = "1" - print(f"[INFO] 已设置 CUDA_VISIBLE_DEVICES=1(使用 GPU1)") - # 确保从当前仓库导入 _REPO_ROOT = Path(__file__).resolve().parents[1] if str(_REPO_ROOT) not in sys.path: diff --git a/examples/test_llada_dvllm_human_eval.py b/examples/test_llada_dvllm_human_eval.py index 5e3608f..1fdb723 100755 --- a/examples/test_llada_dvllm_human_eval.py +++ b/examples/test_llada_dvllm_human_eval.py @@ -83,4 +83,5 @@ def summarize_profiling(csv_path: str) -> dict: "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt 
{idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_quantization_generation.py b/examples/test_quantization_generation.py index 22aaebc..7ffd26f 100755 --- a/examples/test_quantization_generation.py +++ b/examples/test_quantization_generation.py @@ -82,23 +82,13 @@ except Exception: pass -# 自动设置 CUDA 12.2 路径(如果存在) -_CUDA_12_2_PATH = Path("/home/lzx/cuda-12.2") -if _CUDA_12_2_PATH.exists(): - os.environ["CUDA_HOME"] = str(_CUDA_12_2_PATH) - # Some toolchains probe CUDA_PATH instead of CUDA_HOME. - os.environ["CUDA_PATH"] = str(_CUDA_12_2_PATH) - os.environ["PATH"] = f"{_CUDA_12_2_PATH}/bin:{os.environ.get('PATH', '')}" - os.environ["LD_LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}" - os.environ["LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LIBRARY_PATH', '')}" - os.environ["CPATH"] = f"{_CUDA_12_2_PATH}/include:{os.environ.get('CPATH', '')}" - os.environ["CUDACXX"] = str(_CUDA_12_2_PATH / "bin" / "nvcc") - print(f"[INFO] 已自动设置 CUDA 路径: {_CUDA_12_2_PATH}") - -# 设置使用 GPU1(如果 GPU0 被占用) -if "CUDA_VISIBLE_DEVICES" not in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = "1" - print(f"[INFO] 已设置 CUDA_VISIBLE_DEVICES=1(使用 GPU1)") +# +# NOTE: +# 这个脚本不应假设本机 CUDA 安装路径或默认 GPU 号。 +# 如需指定 CUDA/设备,请在运行前自行设置: +# - CUDA_HOME / CUDA_PATH / PATH / LD_LIBRARY_PATH +# - CUDA_VISIBLE_DEVICES +# 或者在你自己的 wrapper 脚本里处理。 # 确保从当前仓库导入 _REPO_ROOT = Path(__file__).resolve().parents[1] @@ -736,7 +726,9 @@ def main(): # 其他选项 parser.add_argument('--max-tokens', type=int, default=30, help='最大生成 token 数(默认: 30)') - parser.add_argument('--model-path', type=str, help='模型路径(默认: 从环境变量 DIFFULEX_TEST_MODEL 读取)') + parser.add_argument('--model-path', type=str, required=True, help='模型路径(必填)') + parser.add_argument('--lora-path', type=str, default="", help='LoRA 路径(可选)') + parser.add_argument('--use-lora', action='store_true', help='启用 LoRA(需同时提供 --lora-path)') parser.add_argument('--gpu-memory-utilization', type=float, default=0.3, help='GPU 内存利用率(默认: 0.3)') parser.add_argument('--no-isolate', action='store_true', help='多策略运行时不使用子进程隔离(调试用,可能导致状态串扰/性能波动)') # Internal: emit a single JSON result line for parent process parsing. 
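The argparse changes above make `--model-path` mandatory and replace the `DIFFULEX_TEST_MODEL` / `DIFFULEX_TEST_LORA` environment variables with explicit `--lora-path` / `--use-lora` flags. A small sketch of the resulting CLI contract is shown below; the model path is a placeholder and this is not the script itself, only an illustration of how `use_lora` is derived from the two flags.

```python
import argparse

# Sketch of the revised example-script CLI (argument names taken from the diff).
parser = argparse.ArgumentParser("quantization generation example (sketch)")
parser.add_argument("--model-path", type=str, required=True, help="model directory (required)")
parser.add_argument("--lora-path", type=str, default="", help="optional LoRA path")
parser.add_argument("--use-lora", action="store_true", help="enable LoRA (needs --lora-path)")
args = parser.parse_args(["--model-path", "/path/to/Dream-v0-Base-7B"])

# LoRA is enabled only when both the flag and a non-empty path are provided.
use_lora = bool(args.use_lora and args.lora_path)
print(args.model_path, use_lora)  # -> /path/to/Dream-v0-Base-7B False
```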
@@ -745,10 +737,10 @@ def main(): args = parser.parse_args() # 确定模型路径 - model_path = args.model_path or os.getenv("DIFFULEX_TEST_MODEL", "/data1/ckpts/Dream-org/Dream-v0-Base-7B") + model_path = args.model_path if not os.path.exists(model_path): print(f"错误: 模型路径不存在: {model_path}") - print("请使用 --model-path 或设置环境变量 DIFFULEX_TEST_MODEL 指向有效的模型路径") + print("请使用 --model-path 指向有效的模型路径") return # 解析要运行的策略 @@ -786,8 +778,8 @@ def main(): # 通用 Diffulex 配置 common_kwargs = { - 'lora_path': os.getenv("DIFFULEX_TEST_LORA", ""), - 'use_lora': bool(os.getenv("DIFFULEX_TEST_LORA", "")), + 'lora_path': args.lora_path, + 'use_lora': bool(args.use_lora and args.lora_path), 'model_name': 'dream', 'enforce_eager': True, 'data_parallel_size': 1, diff --git a/examples/test_sdar_diffulex_gsm8k.py b/examples/test_sdar_diffulex_gsm8k.py index b4f360c..5d9efe7 100755 --- a/examples/test_sdar_diffulex_gsm8k.py +++ b/examples/test_sdar_diffulex_gsm8k.py @@ -64,4 +64,5 @@ "=*=" * 30) for idx, o in enumerate(outputs): print("\n", "=*=" * 30) - print(f"[Prompt {idx} Result] \n{prompts[idx] + "\n----------\n" + o['text']}\n") \ No newline at end of file + resp = prompts[idx] + "\n----------\n" + o["text"] + print(f"[Prompt {idx} Result]\n{resp}\n") \ No newline at end of file diff --git a/examples/test_sdar_dvllm.py b/examples/test_sdar_dvllm.py index 78fbbd7..4c30918 100644 --- a/examples/test_sdar_dvllm.py +++ b/examples/test_sdar_dvllm.py @@ -97,14 +97,14 @@ def main() -> None: parser.add_argument( "--model", type=str, - default="/home/lzx/SDAR/training/model/SDAR-1.7B-Chat", + required=True, help="SDAR HF model directory (contains config.json + model.safetensors).", ) parser.add_argument("--device", type=int, default=0) parser.add_argument( "--converted-dir", type=str, - default="/home/lzx/tmp/diffulex_sdar_converted", + default="tmp/diffulex_sdar_converted", help="Output directory for converted checkpoint keys (Diffulex-native).", ) parser.add_argument("--prompt", type=str, default="你好,请用一句话介绍 SDAR。") diff --git a/profile/torch_d2f_profiler.py b/profile/torch_d2f_profiler.py index e8d36cb..8dfcf18 100644 --- a/profile/torch_d2f_profiler.py +++ b/profile/torch_d2f_profiler.py @@ -40,17 +40,6 @@ except Exception: pass -# Optional: auto CUDA 12.2 toolchain env (align with your other scripts). -_CUDA_12_2_PATH = Path("/home/lzx/cuda-12.2") -if _CUDA_12_2_PATH.exists(): - os.environ.setdefault("CUDA_HOME", str(_CUDA_12_2_PATH)) - os.environ.setdefault("CUDA_PATH", str(_CUDA_12_2_PATH)) - os.environ["PATH"] = f"{_CUDA_12_2_PATH}/bin:{os.environ.get('PATH', '')}" - os.environ["LD_LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}" - os.environ["LIBRARY_PATH"] = f"{_CUDA_12_2_PATH}/lib64:{os.environ.get('LIBRARY_PATH', '')}" - os.environ["CPATH"] = f"{_CUDA_12_2_PATH}/include:{os.environ.get('CPATH', '')}" - os.environ.setdefault("CUDACXX", str(_CUDA_12_2_PATH / "bin" / "nvcc")) - # Ensure import from current repo. 
 _REPO_ROOT = Path(__file__).resolve().parents[1]
 if str(_REPO_ROOT) not in sys.path:
@@ -89,9 +78,10 @@ def _mkdir(p: Path) -> Path:
 
 
 def main() -> None:
     parser = argparse.ArgumentParser("Diffulex torch.profiler flamegraph (D2F/Dream)")
-    parser.add_argument("--model-path", type=str, default=os.getenv("DIFFULEX_TEST_MODEL", "/data1/ckpts/Dream-org/Dream-v0-Base-7B"))
-    parser.add_argument("--lora-path", type=str, default=os.getenv("DIFFULEX_TEST_LORA", ""))
-    parser.add_argument("--use-lora", action="store_true", help="启用 LoRA(需同时提供 --lora-path 或 DIFFULEX_TEST_LORA)")
+    parser.add_argument("--model-path", type=str, required=True, help="模型路径(必填)")
+    parser.add_argument("--lora-path", type=str, default="", help="LoRA 路径(可选)")
+    parser.add_argument("--use-lora", action="store_true", help="启用 LoRA(需同时提供 --lora-path)")
+    parser.add_argument("--cuda-home", type=str, default="", help="(可选)设置 CUDA_HOME/CUDA_PATH 并更新 PATH/LD_LIBRARY_PATH")
     parser.add_argument("--tag", type=str, default="torch_profile", help="输出文件名前缀")
     parser.add_argument("--out-dir", type=str, default="log/torch_profiles", help="输出目录(相对仓库根)")
@@ -152,6 +142,18 @@ def main() -> None:
     args = parser.parse_args()
 
+    if args.cuda_home:
+        cuda_home = Path(args.cuda_home)
+        if not cuda_home.exists():
+            raise FileNotFoundError(f"--cuda-home 不存在: {cuda_home}")
+        os.environ["CUDA_HOME"] = str(cuda_home)
+        os.environ["CUDA_PATH"] = str(cuda_home)
+        os.environ["PATH"] = f"{cuda_home}/bin:{os.environ.get('PATH', '')}"
+        os.environ["LD_LIBRARY_PATH"] = f"{cuda_home}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}"
+        os.environ["LIBRARY_PATH"] = f"{cuda_home}/lib64:{os.environ.get('LIBRARY_PATH', '')}"
+        os.environ["CPATH"] = f"{cuda_home}/include:{os.environ.get('CPATH', '')}"
+        os.environ["CUDACXX"] = str(cuda_home / "bin" / "nvcc")
+
     model_path = Path(args.model_path)
     if not model_path.exists():
         raise FileNotFoundError(f"模型路径不存在: {model_path}")
 
diff --git a/quantization_architecture.md b/quantization_architecture.md
deleted file mode 100644
index 8504bf5..0000000
--- a/quantization_architecture.md
+++ /dev/null
@@ -1,149 +0,0 @@
-# Diffulex 量化模块架构总结
-
-## 一、架构概述
-
-Diffulex的量化模块采用**策略模式(Strategy Pattern)**和**上下文管理(Context Management)**设计,支持灵活的量化策略扩展。模块主要包含以下组件:
-
-### 1.
核心组件 - -#### 1.1 配置层 (Config) -- **QuantizationConfig**: 顶级量化配置,包含KV cache、权重、激活的量化配置 -- **KVCacheQuantConfig**: KV cache量化配置(dtype: bf16/fp8_e4m3/fp8_e5m2) -- **WeightQuantConfig**: 权重量化配置(支持按类型区分:attn/mlp) -- **ActivationQuantConfig**: 激活量化配置(支持按类型区分:attn/mlp) - -#### 1.2 上下文管理 (Context) -- **QuantizationContext**: 线程本地存储(Thread-Local Storage),管理量化策略实例 - - 存储策略实例:`kv_cache`, `linear_attn`, `linear_mlp`, `linear_other` - - 提供激活量化缓存(step-local cache) - - 通过全局函数访问:`get_quantization_context()`, `get_kv_cache_strategy()`, `get_linear_strategy()` - -#### 1.3 工厂模式 (Factory) -- **QuantizationStrategyFactory**: 从配置创建量化策略 - - `create_from_config()`: 从Diffulex配置对象创建并配置量化上下文 - - `create_kv_cache_strategy()`: 创建KV cache量化策略 - -#### 1.4 注册表 (Registry) -- **KV Cache策略注册表**: 通过`@register_kv_cache_strategy`装饰器注册 -- **Linear策略注册表**: 通过`@register_linear_strategy`装饰器注册(按weight_dtype + act_dtype配对) -- 支持dtype别名和规范化(如"fp8" -> "fp8_e4m3") - -#### 1.5 策略接口 (Strategy Interfaces) -- **QuantizationStrategy**: 基础抽象类 - - `quantize()`: 量化张量 - - `dequantize()`: 反量化张量 - - `get_storage_dtype()`: 获取存储数据类型 - - `get_scale_shape()`: 获取scale张量形状 - -- **KVCacheQuantizationStrategy**: KV cache量化策略接口 - - `compute_scales()`: 计算量化scale - - `update_scales()`: 更新量化scale(如running max策略) - - `init_scales()`: 初始化scale - - `quantize_kv_for_store()`: 量化KV用于存储 - - `view_kv_cache_for_kernels()`: 为kernel提供视图 - -- **LinearQuantizationStrategy**: Linear层量化策略接口 - - `linear_forward()`: 执行量化Linear前向传播 - - `quantize_weight_for_kernel()`: 为kernel量化权重 - - `quantize_act_for_kernel()`: 为kernel量化激活 - -#### 1.6 具体策略实现 (Strategy Implementations) - -**KV Cache策略**: -- `KVCacheBF16Strategy`: BF16存储(无量化) -- `KVCacheFP8RunningMaxStrategy`: FP8量化(E4M3/E5M2),使用running max管理scale - -**Linear策略**: -- `LinearBF16Strategy`: BF16权重+BF16激活(无量化) -- `LinearGPTQW4A16Strategy`: GPTQ W4权重+BF16激活 -- `LinearAWQW4A16Strategy`: AWQ W4权重+BF16激活 -- `LinearInt8W8A16Strategy`: INT8权重+BF16激活 -- `LinearInt8W8A8Strategy`: INT8权重+INT8激活 -- `LinearInt4W4A16Strategy`: INT4权重+BF16激活 -- `LinearInt4W4A8Strategy`: INT4权重+INT8激活 -- `LinearFP8W8A16Strategy`: FP8权重+BF16激活 -- `LinearFP8W8A8Strategy`: FP8权重+FP8激活 -- `LinearStubStrategy`: 占位策略(未实现的组合) - -#### 1.7 工具函数 (Utilities) -- **kv_cache_dtype.py**: KV cache数据类型处理 - - `parse_kv_cache_dtype()`: 解析dtype字符串 - - `view_fp8_cache()`: FP8 cache视图转换 - - `ensure_scale_tensor()`: 确保scale张量格式正确 - -## 二、与其他模块的耦合关系 - -### 2.1 模型运行器 (Model Runner) -**文件**: `diffulex/engine/model_runner.py` -- **初始化**: 在`ModelRunnerBase.__init__()`中调用`QuantizationStrategyFactory.create_from_config(config)` -- **KV Cache分配**: 使用`get_kv_cache_strategy()`获取策略,根据策略分配KV cache存储 - -### 2.2 Linear层 -**文件**: `diffulex/layer/linear.py` -- **前向传播**: 在`forward()`中调用`get_linear_strategy(quant_kind)`获取策略 -- **权重量化**: 在`_maybe_quantize_loaded_weight_param()`中,加载权重后自动量化并删除BF16权重参数 -- **离线量化支持**: 支持GPTQ/AWQ离线量化权重的加载和使用 - -### 2.3 KV Cache Kernels -**文件**: `diffulex_kernel/python/kv_cache_kernels.py`, `diffulex_kernel/python/dllm_flash_attn_kernels.py` -- **策略获取**: 在kernel函数中调用`get_kv_cache_strategy()`获取策略 -- **Scale管理**: 使用策略的`update_scales()`更新scale -- **Cache视图**: 使用策略的`view_kv_cache_for_kernels()`获取适合kernel的视图 - -### 2.4 注意力实现 -**文件**: `diffulex/attention/attn_impl.py` -- **策略获取**: 在注意力计算中获取KV cache策略 -- **Scale传递**: 将scale传递给attention metadata - -### 2.5 TP Worker -**文件**: `diffulex/engine/tp_worker.py` -- **缓存清理**: 在每个step开始时调用`clear_act_quant_cache()`清理激活量化缓存 - -## 三、量化流程 - -### 3.1 初始化流程 -1. 
`ModelRunnerBase.__init__()` 调用 `QuantizationStrategyFactory.create_from_config(config)` -2. Factory从config解析`QuantizationConfig` -3. Factory创建KV cache策略和Linear策略(按attn/mlp/other分类) -4. 策略注册到`QuantizationContext`(线程本地存储) - -### 3.2 KV Cache量化流程 -1. **初始化**: 调用`strategy.init_scales()`初始化scale张量 -2. **存储**: 在KV cache存储时,调用`strategy.quantize_kv_for_store()`量化K和V -3. **更新**: 每次前向传播后,调用`strategy.update_scales()`更新running max scale -4. **使用**: Kernel使用`strategy.view_kv_cache_for_kernels()`获取适合的视图 - -### 3.3 Linear量化流程 -1. **权重量化**: - - 在线量化:加载权重时自动调用`strategy.quantize_weight_for_kernel()` - - 离线量化:通过`set_offline_quantized_weight()`加载GPTQ/AWQ权重 -2. **前向传播**: - - 调用`strategy.linear_forward()`执行量化计算 - - 支持TileLang kernel加速(如GPTQ W4A16) - - 支持Python fallback实现 - -### 3.4 激活量化流程(W8A8/W4A8) -1. **缓存**: 使用`QuantizationContext`的step-local cache缓存激活量化结果 -2. **量化**: 在Linear层前向传播时,调用`strategy.quantize_act_for_kernel()` -3. **清理**: 每个step开始时清理缓存 - -## 四、扩展性设计 - -### 4.1 添加新的KV Cache策略 -1. 实现`KVCacheQuantizationStrategy`接口 -2. 使用`@register_kv_cache_strategy("dtype_alias")`注册 -3. 在`strategies/__init__.py`中导入(触发注册) - -### 4.2 添加新的Linear策略 -1. 实现`LinearQuantizationStrategy`接口 -2. 使用`@register_linear_strategy(weight_dtype="...", act_dtype="...")`注册 -3. 在`strategies/__init__.py`中导入(触发注册) - -### 4.3 支持新的量化方法 -- 权重量化:GPTQ, AWQ, INT8, INT4, FP8 -- 激活量化:INT8, INT4, FP8 -- KV Cache量化:FP8 (E4M3/E5M2) - -## 五、架构图 - -详见下面的Mermaid图表。 diff --git a/quantization_architecture_diagram.md b/quantization_architecture_diagram.md deleted file mode 100644 index 5d38fea..0000000 --- a/quantization_architecture_diagram.md +++ /dev/null @@ -1,551 +0,0 @@ -# Diffulex 量化模块架构图 - -## 完整架构图 - -```mermaid -graph TB - subgraph "用户配置层" - Config[Diffulex Config
kv_cache_dtype
linear_attn_weight_dtype
linear_mlp_weight_dtype
...] - end - - subgraph "量化模块核心" - subgraph "配置解析" - QC[QuantizationConfig] - KVC[KVCacheQuantConfig] - WC[WeightQuantConfig] - AC[ActivationQuantConfig] - Config --> QC - QC --> KVC - QC --> WC - QC --> AC - end - - subgraph "工厂与注册表" - Factory[QuantizationStrategyFactory
create_from_config
create_kv_cache_strategy] - RegKV[KV Cache Registry
@register_kv_cache_strategy] - RegLinear[Linear Registry
@register_linear_strategy] - Factory --> RegKV - Factory --> RegLinear - end - - subgraph "上下文管理" - Context[QuantizationContext
Thread-Local Storage] - Context --> |存储| KVStrategy[KV Cache Strategy] - Context --> |存储| LinearAttn[Linear Attn Strategy] - Context --> |存储| LinearMLP[Linear MLP Strategy] - Context --> |存储| LinearOther[Linear Other Strategy] - Context --> |缓存| ActCache[Activation Quant Cache
Step-Local] - end - - subgraph "策略接口层" - BaseStrategy[QuantizationStrategy
quantize/dequantize
get_storage_dtype] - KVInterface[KVCacheQuantizationStrategy
compute_scales
update_scales
quantize_kv_for_store] - LinearInterface[LinearQuantizationStrategy
linear_forward
quantize_weight_for_kernel
quantize_act_for_kernel] - BaseStrategy --> KVInterface - BaseStrategy --> LinearInterface - end - - subgraph "KV Cache策略实现" - KVBF16[KVCacheBF16Strategy
BF16存储] - KVFP8[KVCacheFP8RunningMaxStrategy
FP8 E4M3/E5M2
Running Max Scale] - KVInterface --> KVBF16 - KVInterface --> KVFP8 - end - - subgraph "Linear策略实现" - LBF16[LinearBF16Strategy
BF16/BF16] - LGPTQ[LinearGPTQW4A16Strategy
GPTQ W4/BF16] - LAWQ[LinearAWQW4A16Strategy
AWQ W4/BF16] - LInt8W8A16[LinearInt8W8A16Strategy
INT8/BF16] - LInt8W8A8[LinearInt8W8A8Strategy
INT8/INT8] - LInt4W4A16[LinearInt4W4A16Strategy
INT4/BF16] - LInt4W4A8[LinearInt4W4A8Strategy
INT4/INT8] - LFP8W8A16[LinearFP8W8A16Strategy
FP8/BF16] - LFP8W8A8[LinearFP8W8A8Strategy
FP8/FP8] - LinearInterface --> LBF16 - LinearInterface --> LGPTQ - LinearInterface --> LAWQ - LinearInterface --> LInt8W8A16 - LinearInterface --> LInt8W8A8 - LinearInterface --> LInt4W4A16 - LinearInterface --> LInt4W4A8 - LinearInterface --> LFP8W8A16 - LinearInterface --> LFP8W8A8 - end - - subgraph "工具函数" - KVDType[kv_cache_dtype.py
parse_kv_cache_dtype
view_fp8_cache
ensure_scale_tensor] - end - end - - subgraph "运行时模块" - subgraph "模型运行器" - MR[ModelRunnerBase
__init__] - MR --> |初始化| Factory - MR --> |获取| Context - end - - subgraph "Linear层" - Linear[LinearBase
ReplicatedLinear
ColumnParallelLinear
RowParallelLinear] - Linear --> |forward| Context - Linear --> |quantize_weight| Context - end - - subgraph "KV Cache Kernels" - KVKernel[kv_cache_kernels.py
dllm_flash_attn_kernels.py] - KVKernel --> |获取策略| Context - KVKernel --> |更新scale| KVStrategy - end - - subgraph "注意力实现" - Attn[attn_impl.py] - Attn --> |获取策略| Context - end - - subgraph "TP Worker" - TP[tp_worker.py] - TP --> |清理缓存| Context - end - end - - subgraph "离线量化工具" - Offline[quantize_model.py
GPTQ/AWQ离线量化] - end - - %% 连接关系 - QC --> Factory - Factory --> Context - RegKV --> KVBF16 - RegKV --> KVFP8 - RegLinear --> LBF16 - RegLinear --> LGPTQ - RegLinear --> LAWQ - RegLinear --> LInt8W8A16 - RegLinear --> LInt8W8A8 - RegLinear --> LInt4W4A16 - RegLinear --> LInt4W4A8 - RegLinear --> LFP8W8A16 - RegLinear --> LFP8W8A8 - KVStrategy --> KVInterface - LinearAttn --> LinearInterface - LinearMLP --> LinearInterface - LinearOther --> LinearInterface - KVDType --> KVFP8 - - style Config fill:#e1f5ff - style QC fill:#fff4e1 - style Factory fill:#fff4e1 - style Context fill:#e8f5e9 - style KVInterface fill:#f3e5f5 - style LinearInterface fill:#f3e5f5 - style KVBF16 fill:#fff9c4 - style KVFP8 fill:#fff9c4 - style LGPTQ fill:#fff9c4 - style LAWQ fill:#fff9c4 - style MR fill:#ffebee - style Linear fill:#ffebee - style KVKernel fill:#ffebee -``` - -## 数据流图 - -```mermaid -sequenceDiagram - participant Config as Diffulex Config - participant Factory as QuantizationStrategyFactory - participant Context as QuantizationContext - participant KVStrategy as KV Cache Strategy - participant LinearStrategy as Linear Strategy - participant ModelRunner as ModelRunner - participant LinearLayer as Linear Layer - participant KVKernel as KV Cache Kernel - - Note over Config,KVKernel: 初始化阶段 - Config->>Factory: create_from_config(config) - Factory->>Context: 创建并配置上下文 - Factory->>KVStrategy: 创建KV cache策略 - Factory->>LinearStrategy: 创建Linear策略(attn/mlp/other) - Context->>Context: 存储策略实例 - - Note over ModelRunner,KVKernel: 运行时阶段 - ModelRunner->>Context: get_kv_cache_strategy() - Context->>KVStrategy: 返回策略实例 - ModelRunner->>KVStrategy: init_scales() - KVStrategy->>KVStrategy: 初始化scale张量 - - LinearLayer->>Context: get_linear_strategy(quant_kind) - Context->>LinearStrategy: 返回策略实例 - LinearLayer->>LinearStrategy: linear_forward(x, weight, bias) - LinearStrategy->>LinearStrategy: 执行量化计算 - - KVKernel->>Context: get_kv_cache_strategy() - Context->>KVStrategy: 返回策略实例 - KVKernel->>KVStrategy: update_scales(k, v, k_scale, v_scale) - KVStrategy->>KVStrategy: 更新running max scale - KVKernel->>KVStrategy: quantize_kv_for_store(k, v, scales) - KVStrategy->>KVKernel: 返回量化后的K和V -``` - -## 策略选择流程图 - -```mermaid -flowchart TD - Start[开始] --> LoadConfig[加载Diffulex Config] - LoadConfig --> ParseConfig[解析QuantizationConfig] - ParseConfig --> CheckKVCache{检查kv_cache_dtype} - - CheckKVCache -->|bf16/fp16/fp32| CreateKVBF16[创建KVCacheBF16Strategy] - CheckKVCache -->|fp8/fp8_e4m3| CreateKVFP8E4M3[创建KVCacheFP8RunningMaxStrategy
E4M3] - CheckKVCache -->|fp8_e5m2| CreateKVFP8E5M2[创建KVCacheFP8RunningMaxStrategy
E5M2] - - ParseConfig --> CheckLinearAttn{检查linear_attn配置} - CheckLinearAttn -->|weight_dtype + act_dtype| CreateLinearAttn[创建Linear策略
注册到linear_attn] - - ParseConfig --> CheckLinearMLP{检查linear_mlp配置} - CheckLinearMLP -->|weight_dtype + act_dtype| CreateLinearMLP[创建Linear策略
注册到linear_mlp] - - CreateKVBF16 --> RegisterContext[注册到QuantizationContext] - CreateKVFP8E4M3 --> RegisterContext - CreateKVFP8E5M2 --> RegisterContext - CreateLinearAttn --> RegisterContext - CreateLinearMLP --> RegisterContext - - RegisterContext --> End[完成初始化] - - style CheckKVCache fill:#e1f5ff - style CheckLinearAttn fill:#e1f5ff - style CheckLinearMLP fill:#e1f5ff - style RegisterContext fill:#e8f5e9 -``` - -## Linear量化决策流程图 - -```mermaid -flowchart TD - Start[Linear.forward调用] --> GetStrategy[get_linear_strategy
quant_kind] - GetStrategy --> CheckOffline{检查离线量化权重
GPTQ/AWQ} - - CheckOffline -->|有GPTQ权重| UseGPTQ[使用GPTQ策略
linear_forward
传递qweight/qzeros/scales] - CheckOffline -->|有AWQ权重| UseAWQ[使用AWQ策略
linear_forward
传递qweight/qzeros/scales] - CheckOffline -->|无离线量化| CheckOnline{检查在线量化权重
int8/int4/fp8} - - CheckOnline -->|有量化权重| UseOnline[使用量化策略
linear_forward
传递quant_weight_int8/scales] - CheckOnline -->|无量化权重| CheckStrategy{检查策略} - - CheckStrategy -->|有策略| UseStrategy[使用策略
linear_forward
传递bf16 weight] - CheckStrategy -->|无策略| UseDefault[使用默认F.linear
bf16 weight] - - UseGPTQ --> TryKernel{尝试TileLang Kernel} - TryKernel -->|成功| KernelResult[Kernel计算结果] - TryKernel -->|失败| PythonFallback[Python Fallback
dequantize + F.linear] - - UseAWQ --> TryKernel - UseOnline --> KernelOrPython[Kernel或Python实现] - UseStrategy --> KernelOrPython - UseDefault --> Result[返回结果] - - KernelResult --> Result - PythonFallback --> Result - KernelOrPython --> Result - - style CheckOffline fill:#e1f5ff - style CheckOnline fill:#e1f5ff - style CheckStrategy fill:#e1f5ff - style TryKernel fill:#fff9c4 -``` - -## KV Cache量化流程图 - -### 完整KV Cache量化流程(包含Store和Load) - -```mermaid -flowchart TB - subgraph "Store阶段" - Start[KV Cache Store] --> GetStrategy1[get_kv_cache_strategy] - GetStrategy1 --> CheckFormat1{检查kv_cache_format} - - CheckFormat1 -->|bf16| BF16Store[BF16 Store路径] - CheckFormat1 -->|fp8| FP8Store[FP8 Store路径] - - BF16Store --> StoreBF16[直接存储为BF16
dtype: bfloat16
无需量化] - - FP8Store --> UpdateScales["update_scales
更新running max scale
k_scale/v_scale: float32
shape: (num_kv_heads)"] - UpdateScales --> QuantizeKV["quantize_kv_for_store
K/V: bfloat16 -> uint8
使用k_scale/v_scale量化"] - QuantizeKV --> StoreFP8["存储为uint8
dtype: uint8
FP8格式"] - - StoreBF16 --> CheckLayout1{检查Layout} - StoreFP8 --> CheckLayout1 - - CheckLayout1 -->|unified| StoreUnified["store_kvcache_unified_layout
shape: (num_blocks, page_size, num_kv_heads, head_dim)"] - CheckLayout1 -->|distinct| StoreDistinct["store_kvcache_distinct_layout
k_cache: (num_blks, h, hdim//x, blk_sz, x)
v_cache: (num_blks, h, hdim, blk_sz)"] - end - - subgraph "Load阶段" - LoadStart[KV Cache Load] --> GetStrategy2[get_kv_cache_strategy] - GetStrategy2 --> CheckFormat2{检查kv_cache_format} - - CheckFormat2 -->|bf16| BF16Load[BF16 Load路径] - CheckFormat2 -->|fp8| FP8Load[FP8 Load路径] - - BF16Load --> CheckLayout2{检查Layout} - FP8Load --> CheckLayout2 - - CheckLayout2 -->|unified| UnifiedLoad[Unified Layout Load] - CheckLayout2 -->|distinct| DistinctLoad[Distinct Layout Load
总是使用varlen路径] - - UnifiedLoad --> CheckDecodeMode{检查decode_mode} - CheckDecodeMode -->|static| StaticPath[Static模式
TileLang Kernel] - CheckDecodeMode -->|varlen| VarlenPath[Varlen模式
load_kvcache + flash_attn_varlen_func] - - DistinctLoad --> VarlenPath - - StaticPath --> StaticBF16{BF16?} - StaticPath --> StaticFP8{FP8?} - - StaticBF16 --> TileLangBF16[dllm_flash_attn_decode_kernel
TileLang Kernel
输入: q/k/v/cache bfloat16
输出: bfloat16] - - StaticFP8 --> ViewFP8Cache[strategy.view_kv_cache_for_kernels
uint8 -> float8 view
dtype转换] - ViewFP8Cache --> TileLangFP8[dllm_flash_attn_decode_kernel_bf16_q_fp8_kv
TileLang Kernel
输入: q bfloat16, cache float8
k_scale/v_scale float32
kernel内反量化+scale
输出: bfloat16] - - VarlenPath --> LoadKVCache[load_kvcache函数] - LoadKVCache --> LoadBF16{BF16?} - LoadKVCache --> LoadFP8{FP8?} - - LoadBF16 --> LoadBF16Kernel[_load_kvcache_bf16
Triton Kernel
gather cache blocks
输出: bfloat16] - - LoadFP8 --> LoadFP8Kernel[_load_kvcache_fp8
Triton Fused Kernel
gather + dequant + scale
输入: cache uint8/float8 view
k_scale/v_scale float32
输出: bfloat16] - - LoadBF16Kernel --> FlashAttnBF16[flash_attn_varlen_func
输入: q/k_comb/v_comb bfloat16
输出: bfloat16] - LoadFP8Kernel --> FlashAttnFP8[flash_attn_varlen_func
输入: q/k_comb/v_comb bfloat16
输出: bfloat16] - end - - StoreUnified --> LoadStart - StoreDistinct --> LoadStart - TileLangBF16 --> End[完成] - TileLangFP8 --> End - FlashAttnBF16 --> End - FlashAttnFP8 --> End - - style CheckFormat1 fill:#e1f5ff - style CheckFormat2 fill:#e1f5ff - style CheckLayout1 fill:#fff9c4 - style CheckLayout2 fill:#fff9c4 - style CheckDecodeMode fill:#fff9c4 - style QuantizeKV fill:#ffebee - style ViewFP8Cache fill:#ffebee - style StaticPath fill:#e8f5e9 - style VarlenPath fill:#e8f5e9 -``` - -### 数据类型传递详细图 - -```mermaid -sequenceDiagram - participant AttnImpl as Attention Implementation - participant Strategy as KV Cache Strategy - participant StoreKernel as Store Kernel - participant Cache as KV Cache Storage - participant LoadKernel as Load Kernel - participant DecodeKernel as Decode Kernel - participant FlashAttn as flash_attn_varlen_func - - Note over AttnImpl,FlashAttn: BF16路径 (Unified Layout, Static Mode) - AttnImpl->>Strategy: get_kv_cache_strategy() - Strategy-->>AttnImpl: KVCacheBF16Strategy - AttnImpl->>AttnImpl: k: (N, H, D) bfloat16
v: (N, H, D) bfloat16 - AttnImpl->>StoreKernel: store_kvcache_unified_layout
k, v, cache, slot_mapping - StoreKernel->>Cache: 直接存储
dtype: bfloat16
shape: (num_blocks, page_size, H, D) - AttnImpl->>DecodeKernel: dllm_flash_attn_decode
q: bfloat16
k_cache: bfloat16
v_cache: bfloat16 - DecodeKernel->>DecodeKernel: TileLang Kernel
内部gather + attention计算 - DecodeKernel-->>AttnImpl: output: bfloat16 - - Note over AttnImpl,FlashAttn: FP8路径 (Unified Layout, Static Mode) - AttnImpl->>Strategy: get_kv_cache_strategy() - Strategy-->>AttnImpl: KVCacheFP8RunningMaxStrategy - AttnImpl->>AttnImpl: k: (N, H, D) bfloat16
v: (N, H, D) bfloat16 - AttnImpl->>Strategy: update_scales(k, v, k_scale, v_scale) - Strategy-->>AttnImpl: k_scale: (H) float32
v_scale: (H) float32 - AttnImpl->>Strategy: quantize_kv_for_store(k, v, k_scale, v_scale) - Strategy->>Strategy: 量化: k/v bfloat16 -> uint8
使用scale进行量化 - Strategy-->>AttnImpl: k_q: (N, H, D) uint8
v_q: (N, H, D) uint8 - AttnImpl->>StoreKernel: store_kvcache_unified_layout
k_q, v_q (uint8) - StoreKernel->>Cache: 存储为uint8
dtype: uint8
shape: (num_blocks, page_size, H, D) - AttnImpl->>Strategy: view_kv_cache_for_kernels(cache) - Strategy->>Strategy: uint8 -> float8 view
dtype转换(不改变存储) - Strategy-->>AttnImpl: cache_fp8: float8 view - AttnImpl->>DecodeKernel: dllm_flash_attn_decode_bf16_q_fp8_kv
q: bfloat16
k_cache: float8 view
v_cache: float8 view
k_scale: (H) float32
v_scale: (H) float32 - DecodeKernel->>DecodeKernel: TileLang Kernel
内部: gather + dequant + scale + attention
float8 -> bfloat16 (反量化) - DecodeKernel-->>AttnImpl: output: bfloat16 - - Note over AttnImpl,FlashAttn: FP8路径 (Unified/Distinct Layout, Varlen Mode) - AttnImpl->>Strategy: get_kv_cache_strategy() - Strategy-->>AttnImpl: KVCacheFP8RunningMaxStrategy - AttnImpl->>Strategy: update_scales(k, v, k_scale, v_scale) - Strategy-->>AttnImpl: k_scale: (H) float32
v_scale: (H) float32 - AttnImpl->>Strategy: quantize_kv_for_store(k, v, k_scale, v_scale) - Strategy-->>AttnImpl: k_q: (N, H, D) uint8
v_q: (N, H, D) uint8 - AttnImpl->>StoreKernel: store_kvcache_*_layout
k_q, v_q (uint8) - StoreKernel->>Cache: 存储为uint8
dtype: uint8 - AttnImpl->>LoadKernel: load_kvcache(cache, metadata, k_new, v_new) - LoadKernel->>Strategy: view_kv_cache_for_kernels(cache) - Strategy-->>LoadKernel: cache_fp8: float8 view - LoadKernel->>LoadKernel: Triton Fused Kernel
load_kvcache_kernel_fp8_*
输入: cache float8 view
k_scale/v_scale float32
操作: gather + dequant + scale
输出: k_comb/v_comb bfloat16 - LoadKernel-->>AttnImpl: k_comb: (total_len, H, D) bfloat16
v_comb: (total_len, H, D) bfloat16 - AttnImpl->>FlashAttn: flash_attn_varlen_func
q: bfloat16
k_comb: bfloat16
v_comb: bfloat16 - FlashAttn-->>AttnImpl: output: bfloat16 -``` - -### Layout和Decode模式决策树 - -```mermaid -flowchart TD - Start[KV Cache操作] --> CheckLayout{检查kv_cache_layout} - - CheckLayout -->|unified| UnifiedPath["Unified Layout
shape: (num_blocks, page_size, H, D)"] - CheckLayout -->|distinct| DistinctPath["Distinct Layout
k: (num_blks, h, hdim//x, blk_sz, x)
v: (num_blks, h, hdim, blk_sz)"] - - UnifiedPath --> CheckDecodeMode{检查decode_mode} - CheckDecodeMode -->|static| UnifiedStatic[Static模式
TileLang Kernel] - CheckDecodeMode -->|varlen| UnifiedVarlen[Varlen模式
load_kvcache + flash_attn_varlen_func] - - DistinctPath --> DistinctVarlen[总是Varlen模式
load_kvcache + flash_attn_varlen_func] - - UnifiedStatic --> CheckQuant1{量化格式?} - CheckQuant1 -->|bf16| StaticBF16[TileLang BF16 Kernel
dllm_flash_attn_decode_kernel
输入/输出: bfloat16] - CheckQuant1 -->|fp8| StaticFP8[TileLang FP8 Kernel
dllm_flash_attn_decode_kernel_bf16_q_fp8_kv
输入: q bfloat16, cache float8
scale: float32
输出: bfloat16] - - UnifiedVarlen --> CheckQuant2{量化格式?} - DistinctVarlen --> CheckQuant2 - - CheckQuant2 -->|bf16| VarlenBF16[load_kvcache_bf16
Triton gather kernel
输出: bfloat16
+ flash_attn_varlen_func] - CheckQuant2 -->|fp8| VarlenFP8[load_kvcache_fp8
Triton fused kernel
gather + dequant + scale
输入: cache float8, scale float32
输出: bfloat16
+ flash_attn_varlen_func] - - StaticBF16 --> End[完成] - StaticFP8 --> End - VarlenBF16 --> End - VarlenFP8 --> End - - style CheckLayout fill:#e1f5ff - style CheckDecodeMode fill:#e1f5ff - style CheckQuant1 fill:#fff9c4 - style CheckQuant2 fill:#fff9c4 - style UnifiedStatic fill:#e8f5e9 - style UnifiedVarlen fill:#e8f5e9 - style DistinctVarlen fill:#e8f5e9 - style StaticFP8 fill:#ffebee - style VarlenFP8 fill:#ffebee -``` - -### 详细数据流图:Unified Layout Static模式(FP8) - -```mermaid -flowchart LR - subgraph "Store阶段" - K1["K: bfloat16
(N, H, D)"] --> UpdateScale["update_scales
计算/更新scale"] - V1["V: bfloat16
(N, H, D)"] --> UpdateScale - UpdateScale --> KScale["k_scale: float32
(H)"] - UpdateScale --> VScale["v_scale: float32
(H)"] - K1 --> Quantize["quantize_kv_for_store
使用scale量化"] - V1 --> Quantize - KScale --> Quantize - VScale --> Quantize - Quantize --> KQ["K_q: uint8
(N, H, D)"] - Quantize --> VQ["V_q: uint8
(N, H, D)"] - KQ --> Store["store_kvcache_unified_layout
Triton Kernel"] - VQ --> Store - Store --> Cache["Cache: uint8
(num_blocks, page_size, H, D)"] - end - - subgraph "Load阶段 - Static模式" - Cache --> View["view_kv_cache_for_kernels
uint8 -> float8 view"] - View --> CacheFP8["Cache: float8 view
(num_blocks, page_size, H, D)"] - Q["Q: bfloat16
(num_seqs, num_heads, D)"] --> DecodeKernel - CacheFP8 --> DecodeKernel["dllm_flash_attn_decode_kernel_bf16_q_fp8_kv
TileLang Kernel"] - KScale --> DecodeKernel - VScale --> DecodeKernel - DecodeKernel --> Output["Output: bfloat16
(num_seqs, num_heads, D)"] - end - - style UpdateScale fill:#fff9c4 - style Quantize fill:#ffebee - style View fill:#ffebee - style DecodeKernel fill:#e8f5e9 -``` - -### 详细数据流图:Varlen模式(FP8,Unified/Distinct Layout) - -```mermaid -flowchart LR - subgraph "Store阶段" - K1["K: bfloat16
(N, H, D)"] --> UpdateScale["update_scales
计算/更新scale"] - V1["V: bfloat16
(N, H, D)"] --> UpdateScale - UpdateScale --> KScale["k_scale: float32
(H)"] - UpdateScale --> VScale["v_scale: float32
(H)"] - K1 --> Quantize["quantize_kv_for_store
使用scale量化"] - V1 --> Quantize - KScale --> Quantize - VScale --> Quantize - Quantize --> KQ["K_q: uint8
(N, H, D)"] - Quantize --> VQ["V_q: uint8
(N, H, D)"] - KQ --> Store{Layout?} - VQ --> Store - Store -->|unified| StoreUnified["store_kvcache_unified_layout"] - Store -->|distinct| StoreDistinct["store_kvcache_distinct_layout"] - StoreUnified --> CacheU["Cache: uint8
Unified: (num_blocks, page_size, H, D)"] - StoreDistinct --> CacheD["Cache: uint8
Distinct: k (num_blks, h, hdim//x, blk_sz, x)
v (num_blks, h, hdim, blk_sz)"] - end - - subgraph "Load阶段 - Varlen模式" - CacheU --> LoadKernel - CacheD --> LoadKernel["load_kvcache
Triton Fused Kernel"] - KNew["K_new: bfloat16
(N_new, H, D)"] --> LoadKernel - VNew["V_new: bfloat16
(N_new, H, D)"] --> LoadKernel - KScale --> LoadKernel - VScale --> LoadKernel - Metadata["attn_metadata
block_tables, cu_seqlens, etc."] --> LoadKernel - LoadKernel --> View["view_kv_cache_for_kernels
uint8 -> float8 view"] - View --> GatherDequant["load_kvcache_kernel_fp8_*
gather + dequant + scale
float8 -> bfloat16"] - GatherDequant --> KComb["K_comb: bfloat16
(total_len, H, D)"] - GatherDequant --> VComb["V_comb: bfloat16
(total_len, H, D)"] - Q["Q: bfloat16
(total_len, num_heads, D)"] --> FlashAttn - KComb --> FlashAttn["flash_attn_varlen_func
Flash Attention"] - VComb --> FlashAttn - FlashAttn --> Output["Output: bfloat16
(total_len, num_heads, D)"] - end - - style UpdateScale fill:#fff9c4 - style Quantize fill:#ffebee - style View fill:#ffebee - style GatherDequant fill:#ffebee - style FlashAttn fill:#e8f5e9 -``` - -### 关键数据类型转换总结表 - -| 阶段 | 操作 | 输入类型 | 输出类型 | 说明 | -|------|------|---------|---------|------| -| **Store (BF16)** | 直接存储 | `bfloat16 [N, H, D]` | `bfloat16 [num_blocks, page_size, H, D]` | 无需量化,直接存储 | -| **Store (FP8)** | quantize_kv_for_store | `bfloat16 [N, H, D]` + `float32 [H]` scale | `uint8 [N, H, D]` | 量化并存储为uint8 | -| **Store (FP8)** | 存储到cache | `uint8 [N, H, D]` | `uint8 [num_blocks, page_size, H, D]` | 存储为uint8格式 | -| **Load (Static FP8)** | view_kv_cache_for_kernels | `uint8 [num_blocks, page_size, H, D]` | `float8 view [num_blocks, page_size, H, D]` | 视图转换,不改变存储 | -| **Load (Static FP8)** | TileLang Kernel | `float8 view` + `float32 [H]` scale | `bfloat16 [num_seqs, num_heads, D]` | Kernel内反量化+scale | -| **Load (Varlen FP8)** | view_kv_cache_for_kernels | `uint8 [num_blocks, page_size, H, D]` | `float8 view [num_blocks, page_size, H, D]` | 视图转换 | -| **Load (Varlen FP8)** | Triton Fused Kernel | `float8 view` + `float32 [H]` scale | `bfloat16 [total_len, H, D]` | gather + dequant + scale | -| **Attention** | flash_attn_varlen_func | `bfloat16 [total_len, num_heads, D]` | `bfloat16 [total_len, num_heads, D]` | Flash Attention计算 | - -### 路径选择决策表 - -| Layout | Decode Mode | 量化格式 | Store Kernel | Load Kernel | Attention Kernel | -|--------|-------------|---------|--------------|-------------|------------------| -| Unified | static | bf16 | `store_kvcache_unified_layout` → BF16 kernel | 无(直接使用cache) | `dllm_flash_attn_decode_kernel` (TileLang) | -| Unified | static | fp8 | `store_kvcache_unified_layout` → FP8 kernel | `view_kv_cache_for_kernels` | `dllm_flash_attn_decode_kernel_bf16_q_fp8_kv` (TileLang) | -| Unified | varlen | bf16 | `store_kvcache_unified_layout` → BF16 kernel | `load_kvcache_bf16` (Triton) | `flash_attn_varlen_func` | -| Unified | varlen | fp8 | `store_kvcache_unified_layout` → FP8 kernel | `load_kvcache_fp8` (Triton fused) | `flash_attn_varlen_func` | -| Distinct | varlen | bf16 | `store_kvcache_distinct_layout` → BF16 kernel | `load_kvcache_bf16` (Triton) | `flash_attn_varlen_func` | -| Distinct | varlen | fp8 | `store_kvcache_distinct_layout` → FP8 kernel | `load_kvcache_fp8` (Triton fused) | `flash_attn_varlen_func` | - -**注意**: -- Distinct layout **总是**使用varlen模式(因为K的split layout不适合static模式) -- Static模式**仅支持**Unified layout -- FP8量化在static模式下,反量化在TileLang kernel内部完成 -- FP8量化在varlen模式下,反量化在`load_kvcache`的Triton fused kernel中完成