diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..a9e8a6d --- /dev/null +++ b/.editorconfig @@ -0,0 +1,44 @@ +# https://editorconfig.org/ + +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_style = space +indent_size = 4 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.{py,pyi}] +indent_size = 4 + +[*.{cpp,hpp,cxx,cc,c,h,cu,cuh}] +indent_size = 2 + +[{*.cmake,CMakeLists.txt}] +indent_size = 2 + +[*.{yaml,yml}] +indent_size = 2 + +[.clang-{format,tidy}] +indent_size = 2 + +[Makefile] +indent_style = tab + +[*.sh] +indent_size = 4 + +[*.bat] +indent_size = 4 +end_of_line = crlf + +[*.md] +indent_size = 2 +x-soft-wrap-text = true + +[*.rst] +indent_size = 4 +x-soft-wrap-text = true diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bbb14db --- /dev/null +++ b/.gitattributes @@ -0,0 +1,10 @@ +* text eol=lf +*.bat eol=crlf + +*.svg binary +*.jpg binary +*.jpeg binary +*.png binary +*.gif binary + +*.h linguist-language=C++ diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..0f0d8f8 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,12 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + day: "monday" + time: "12:00" + timezone: "Asia/Shanghai" + commit-message: + prefix: "[CI]" + diff --git a/.github/workflows/pr-perfbench-bot.yml b/.github/workflows/pr-perfbench-bot.yml index c1a357a..a3177d0 100644 --- a/.github/workflows/pr-perfbench-bot.yml +++ b/.github/workflows/pr-perfbench-bot.yml @@ -7,6 +7,8 @@ on: permissions: contents: read + issues: write + pull-requests: write concurrency: group: "${{ github.workflow }}-${{ github.ref }}" diff --git a/.github/workflows/pr-reminder-bot.yml b/.github/workflows/pr-reminder-bot.yml index 5689c84..799a149 100644 --- a/.github/workflows/pr-reminder-bot.yml +++ b/.github/workflows/pr-reminder-bot.yml @@ -5,6 +5,10 @@ on: types: - opened +permissions: + issues: write + pull-requests: write + jobs: remind: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 560b74d..9db0810 100755 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ kernel_diff_analysis.md tilelang_optimization_analysis.md boundary_check_comparison.md GITHUB_ISSUE.md +Tilelang-failed_test_cases/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f52f91b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,59 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +ci: + autofix_prs: false + autofix_commit_msg: "[Lint]: [pre-commit.ci] auto fixes [...]" + autoupdate_commit_msg: "[CI] [pre-commit.ci] autoupdate" + autoupdate_schedule: monthly +default_stages: [pre-commit, pre-push, manual] +exclude: '^(build|3rdparty)/.*$' # exclude build and 3rdparty directories +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-symlinks + - id: destroyed-symlinks + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-added-large-files + - id: check-merge-conflict + fail_fast: true + - id: check-executables-have-shebangs + - id: check-shebang-scripts-are-executable + - id: detect-private-key + - id: check-yaml + - id: check-toml + - id: check-ast + fail_fast: true + - id: debug-statements + - id: file-contents-sorter + args: [--ignore-case] + files: ^docs/spelling_wordlist\.txt$ + - repo: 
https://github.com/pre-commit/mirrors-clang-format + rev: v21.1.7 # sync with requirements-lint.txt + hooks: + - id: clang-format + types_or: [c++, c] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.9 # sync with requirements-lint.txt + hooks: + - id: ruff-check + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format + args: [--exit-non-zero-on-format] + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 # sync with requirements-lint.txt + hooks: + - id: codespell + additional_dependencies: [".[toml]"] + exclude: | + (?x)( + ^.+\.(cpp|hpp|cxx|cc|c|h|cu|cuh)$| + ^.+\.svg$| + ^.*\brequirements\b.*\.txt$ + ) + - repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.33 + hooks: + - id: pymarkdown + args: ["--config", ".pymarkdown", "fix"] diff --git a/.pymarkdown b/.pymarkdown new file mode 100644 index 0000000..5394265 --- /dev/null +++ b/.pymarkdown @@ -0,0 +1,37 @@ +{ + "plugins": { + "md003": { + "style": "atx" + }, + "md004": { + "style": "dash" + }, + "md013": { + "enabled": false + }, + "md026": { + "enabled": false + }, + "md029": { + "enabled": false + }, + "md031": { + "enabled": false + }, + "md032": { + "enabled": false + }, + "md033": { + "enabled": false + }, + "md034": { + "enabled": false + }, + "md040": { + "enabled": false + }, + "md041": { + "enabled": false + } + } +} diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..5eba904 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,132 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +- Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or advances of + any kind +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, + without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. 
+ +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +[leiwang1999@outlook.com](mailto:leiwang1999@outlook.com) +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. 
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..49a659f
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,110 @@
+# Contributing
+
+It would be awesome if you want to contribute something to Diffulex!
+
+## Table of Contents
+
+- [Report Bugs](#report-bugs)
+- [Ask Questions](#ask-questions)
+- [Submit Pull Requests](#submit-pull-requests)
+- [Setup Development Environment](#setup-development-environment)
+- [Install Develop Version](#install-develop-version)
+- [Lint Check](#lint-check)
+- [Test Locally](#test-locally)
+- [Build Wheels](#build-wheels)
+- [Documentation](#documentation)
+
+## Report Bugs
+
+If you run into any weird behavior while using Diffulex, feel free to open a new issue in this repository! Please run a **search before opening** a new issue to make sure that someone else hasn't already reported or solved the bug you've found.
+
+Any issue you open must include:
+
+- A code snippet that reproduces the bug with a minimal setup.
+- A clear explanation of what the issue is.
+
+## Ask Questions
+
+Please ask questions by opening an issue.
+
+## Submit Pull Requests
+
+All pull requests are very welcome and greatly appreciated! Issues in need of a solution are marked with a [`♥ help`](https://github.com/zhijie-group/Diffulex/issues?q=is%3Aissue+is%3Aopen+label%3A%22%E2%99%A5+help%22) label if you're looking for somewhere to start.
+
+If you're new to contributing to Diffulex, please follow the guidelines below before submitting a pull request.
+
+> [!NOTE]
+> Please include tests and docs with every pull request if applicable!
+
+## Setup Development Environment
+
+Before contributing to Diffulex, please follow the instructions below to set up your environment.
+
+1. Fork Diffulex ([fork](https://github.com/zhijie-group/Diffulex/fork)) on GitHub and clone the repository.
+
+    ```bash
+    git clone --recurse-submodules git@github.com:<your-username>/Diffulex.git # use the SSH protocol
+    cd Diffulex
+
+    git remote add upstream git@github.com:zhijie-group/Diffulex.git
+    ```
+
+2. Set up a development environment:
+
+    ```bash
+    uv venv --seed .venv # use `python3 -m venv .venv` if you don't have `uv`
+
+    source .venv/bin/activate
+    python3 -m pip install --upgrade pip setuptools wheel "build[uv]"
+    uv pip install --requirements requirements-dev.txt
+    ```
+
+3. Set up the [`pre-commit`](https://pre-commit.com) hooks:
+
+    ```bash
+    pre-commit install --install-hooks
+    ```
+
+Then you are ready to rock. Thanks for contributing to Diffulex!
+
+## Install Develop Version
+
+To install Diffulex in "editable" mode, run:
+
+```bash
+python3 -m pip install --no-build-isolation --verbose --editable .
+```
+
+in the main directory. You can remove this installation with:
+
+```bash
+python3 -m pip uninstall diffulex
+```
+
+We also recommend installing Diffulex in a more manual way for better control over the build process, by compiling the C++ extensions first and setting the `PYTHONPATH`. See the documentation for detailed instructions.
+
+## Lint Check
+
+To run the lint checks, run:
+
+```bash
+pre-commit run --all-files
+```
+
+## Test Locally
+
+To run the tests, start by building the project as described in the [Setup Development Environment](#setup-development-environment) section.
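+
+While iterating on a change, you can also run just a subset of the suite with standard `pytest` selection flags. The test file path and keyword below are illustrative; substitute names that actually exist under `testing/` in your checkout:
+
+```bash
+# Run a single test file with verbose output
+python3 -m pytest testing/test_config.py -v
+
+# Run only the tests whose names match a keyword expression
+python3 -m pytest testing -k "sampler"
+```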
+ +Then you can rerun the tests with: + +```bash +python3 -m pytest testing +``` + +## Build Wheels + +_TBA_ + +## Documentation + +_TBA_ diff --git a/Tilelang-failed_test_cases b/Tilelang-failed_test_cases deleted file mode 160000 index f83a764..0000000 --- a/Tilelang-failed_test_cases +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f83a764960088a375366d39d8376c3da6640e64a diff --git a/diffulex/__init__.py b/diffulex/__init__.py index 63dd056..2f67128 100755 --- a/diffulex/__init__.py +++ b/diffulex/__init__.py @@ -1,4 +1,13 @@ from diffulex.diffulex import Diffulex from diffulex.sampling_params import SamplingParams +from diffulex.logger import get_logger, setup_logger, LoggerMixin # Import strategies to trigger registration from diffulex import strategy, model, sampler # noqa: F401 + +__all__ = [ + "Diffulex", + "SamplingParams", + "get_logger", + "setup_logger", + "LoggerMixin", +] diff --git a/diffulex/attention/__init__.py b/diffulex/attention/__init__.py index dbd6e52..7e536f8 100644 --- a/diffulex/attention/__init__.py +++ b/diffulex/attention/__init__.py @@ -20,7 +20,7 @@ def __getattr__(name): try: from .attn_impl import Attention return Attention - except e: + except Exception as e: raise ImportError(f"Failed to import diffulex.attention.attn_impl.Attention: {e}") if name == "fetch_attn_metadata": return metadata.fetch_attn_metadata diff --git a/diffulex/config.py b/diffulex/config.py index 96af47c..6d8dfba 100755 --- a/diffulex/config.py +++ b/diffulex/config.py @@ -1,7 +1,10 @@ import os -from dataclasses import dataclass +from dataclasses import dataclass, field from transformers import AutoConfig +from diffulex.logger import get_logger + +logger = get_logger(__name__) @dataclass @@ -31,9 +34,10 @@ class Config: master_addr: str = "localhost" master_port: int = 2333 # Shared memory segment name for intra-TP RPC; must be unique per DP group. - shm_name: str = "diffuserve_shm" + shm_name: str = "diffulex_shm" # Start device index for this TP group (set by DP launcher). 
device_start: int = 0 + device_ids: list[int] = field(default_factory=lambda: []) enforce_eager: bool = False hf_config: AutoConfig | None = None @@ -56,9 +60,18 @@ def __post_init__(self): if not self.lora_path: raise ValueError("lora_path must be provided when use_lora is True") if not os.path.exists(self.lora_path): - print(f"Warning: LoRA path {self.lora_path} does not exist") + logger.warning(f"LoRA path {self.lora_path} does not exist") self.hf_config = AutoConfig.from_pretrained(self.model, trust_remote_code=True) cfg_max_model_len = self.hf_config.max_position_embeddings if hasattr(self.hf_config, "max_position_embeddings") else self.hf_config.max_sequence_length self.max_model_len = min(self.max_model_len, cfg_max_model_len) - assert self.max_num_batched_tokens >= self.max_model_len \ No newline at end of file + assert self.max_num_batched_tokens >= self.max_model_len + + if not self.device_ids: + import torch + self.device_ids = ( + [int(x) for x in os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",") if x.strip()] + if os.environ.get("CUDA_VISIBLE_DEVICES", "") + else list(range(torch.cuda.device_count())) + ) + logger.info(f"Using CUDA devices: {self.device_ids}") \ No newline at end of file diff --git a/diffulex/diffulex.py b/diffulex/diffulex.py index 08612ba..8a46e5a 100755 --- a/diffulex/diffulex.py +++ b/diffulex/diffulex.py @@ -4,7 +4,7 @@ class Diffulex: def __new__(cls, model, **kwargs): - cfg = Config(model, **{k: v for k, v in kwargs.items() if k in Config.__dataclass_fields__.keys()}) - if cfg.data_parallel_size > 1: + data_parallel_size = kwargs.get('data_parallel_size', 1) + if data_parallel_size > 1: return DiffulexDPWorker(model, **kwargs) return DiffulexTPWorker(model, **kwargs) \ No newline at end of file diff --git a/diffulex/engine/dp_worker.py b/diffulex/engine/dp_worker.py index 0281930..a76239a 100755 --- a/diffulex/engine/dp_worker.py +++ b/diffulex/engine/dp_worker.py @@ -13,6 +13,9 @@ from diffulex.config import Config from diffulex.engine.tp_worker import DiffulexTPWorker from diffulex.sampling_params import SamplingParams +from diffulex.logger import get_logger + +logger = get_logger(__name__) def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn): @@ -23,11 +26,12 @@ def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn) faulthandler.enable(all_threads=True) except Exception: pass - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(x) for x in local_devices) + # os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(x) for x in local_devices) cfg = Config( model=config.model, lora_path=config.lora_path, model_name=config.model_name, + decoding_strategy=config.decoding_strategy, mask_token_id=config.mask_token_id, diffusion_block_size=config.diffusion_block_size, accept_threshold=config.accept_threshold, @@ -50,6 +54,7 @@ def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn) kv_cache_layout=config.kv_cache_layout, ) setattr(cfg, "device_start", 0) + setattr(cfg, "device_ids", local_devices) engine = DiffulexTPWorker(cfg.model, **{k: getattr(cfg, k) for k in cfg.__dataclass_fields__.keys() if k != "model"}) @@ -79,7 +84,7 @@ def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn) else: conn.send(("err", f"unknown_cmd:{cmd}")) except Exception as e: - # Include full traceback for easier debugging and also print to stderr as a fallback. + # Include full traceback for easier debugging and also log as a fallback. 
tb = traceback.format_exc() msg = f"{type(e).__name__}: {e}\n{tb}" try: @@ -87,9 +92,15 @@ def _dp_child_entry(config: Config, dp_idx: int, local_devices: list[int], conn) except Exception: pass try: - print(f"[DP Child {dp_idx}] Unhandled exception:\n{msg}", file=sys.stderr, flush=True) + # Use logger for error reporting + child_logger = get_logger(f"diffulex.engine.dp_worker.child_{dp_idx}") + child_logger.error(f"[DP Child {dp_idx}] Unhandled exception:\n{msg}") except Exception: - pass + # Final fallback to stderr + try: + print(f"[DP Child {dp_idx}] Unhandled exception:\n{msg}", file=sys.stderr, flush=True) + except Exception: + pass class DiffulexDPWorker: diff --git a/diffulex/engine/model_runner.py b/diffulex/engine/model_runner.py index 5b45314..c9b7c80 100755 --- a/diffulex/engine/model_runner.py +++ b/diffulex/engine/model_runner.py @@ -14,6 +14,9 @@ from diffulex.attention.metadata import set_warming_up, reset_warming_up from diffulex.model import AutoModelForDiffusionLM from diffulex.engine.strategy_registry import DiffulexStrategyRegistry +from diffulex.logger import get_logger + +logger = get_logger(__name__) class ModelRunnerBase(ABC): @@ -29,8 +32,8 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event]): # Initialize model, sampler, and kv cache init_method = f"tcp://{config.master_addr}:{config.master_port}" - dist.init_process_group("nccl", init_method, world_size=self.world_size, rank=rank) - device_id = (getattr(config, "device_start", 0) or 0) + rank + dist.init_process_group("nccl", init_method, world_size=self.world_size, rank=rank, device_id=config.device_ids[rank]) + device_id = (getattr(config, "device_start", 0) or 0) + rank + config.device_ids[rank] assert 0 <= device_id < torch.cuda.device_count(), f"Invalid device_id {device_id}." torch.cuda.set_device(device_id) default_dtype = torch.get_default_dtype() @@ -120,7 +123,7 @@ def load_sampler(self, config: Config): return AutoSampler.from_config(config) def _prefill_warmup(self): - print("Warming up prefill...") + logger.info("Warming up prefill...") max_num_batched_tokens, max_model_len = ( self.config.max_num_batched_tokens, self.config.max_model_len, @@ -134,7 +137,7 @@ def _prefill_warmup(self): torch.cuda.empty_cache() def warmup_model(self): - print("Warming up model...") + logger.info("Warming up model...") set_warming_up(True) torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() @@ -184,26 +187,22 @@ def allocate_kv_cache(self): except Exception: gpu_memory_utilization = config.gpu_memory_utilization while num_kvcache_blocks <= 200: - print( - "Warning: GPU memory utilization " - f"{gpu_memory_utilization} is too low to allocate kv cache. " + logger.warning( + f"GPU memory utilization {gpu_memory_utilization} is too low to allocate kv cache. " "Automatically adding 0.05." ) gpu_memory_utilization += 0.05 num_kvcache_blocks = get_num_kvcache_blocks(gpu_memory_utilization) - print( + logger.info( f"Set gpu_memory_utilization to {gpu_memory_utilization:.2f} " "to allocate kv cache." ) config.gpu_memory_utilization = gpu_memory_utilization config.num_kvcache_blocks = num_kvcache_blocks - print( - "Allocated {num_blocks} blocks of size {block_size} for kv cache on rank {rank}.".format( - num_blocks=config.num_kvcache_blocks, - block_size=self.block_size, - rank=self.rank, - ) + logger.info( + f"Allocated {config.num_kvcache_blocks} blocks of size {self.block_size} " + f"for kv cache on rank {self.rank}." 
) if config.kv_cache_layout == "distinct": diff --git a/diffulex/engine/tp_worker.py b/diffulex/engine/tp_worker.py index 3ea53c5..765ed5c 100755 --- a/diffulex/engine/tp_worker.py +++ b/diffulex/engine/tp_worker.py @@ -12,6 +12,9 @@ from diffulex.engine.sequence import AutoSequence from diffulex.engine.scheduler import AutoScheduler, SchedulerBase from diffulex.engine.model_runner import AutoModelRunner +from diffulex.logger import get_logger + +logger = get_logger(__name__) class DiffulexTPWorker: @@ -118,7 +121,10 @@ def generate( if use_tqdm: pbar.update(1) - print(f"Finished in {n_steps} steps, prefill throughput: {prefill_throughput:.2f} tok/s, decode throughput: {decode_throughput:.2f} tok/s") + logger.info( + f"Finished in {n_steps} steps, prefill throughput: {prefill_throughput:.2f} tok/s, " + f"decode throughput: {decode_throughput:.2f} tok/s" + ) # Ensure all outputs are present assert all(toks is not None for toks in outputs), "Some sequences did not produce outputs" outputs = [{ diff --git a/diffulex/logger.py b/diffulex/logger.py new file mode 100644 index 0000000..821feac --- /dev/null +++ b/diffulex/logger.py @@ -0,0 +1,176 @@ +""" +Professional logging setup with colored output for Diffulex +""" + +import logging +import sys +from pathlib import Path +from typing import Optional + +try: + from rich.console import Console + from rich.logging import RichHandler + from rich.traceback import install as install_rich_traceback + from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn + RICH_AVAILABLE = True +except ImportError: + RICH_AVAILABLE = False + +try: + import colorama + from colorama import Fore, Style, init as init_colorama + COLORAMA_AVAILABLE = True + init_colorama(autoreset=True) +except ImportError: + COLORAMA_AVAILABLE = False + + +class ColoredFormatter(logging.Formatter): + """Custom formatter with color support""" + + if COLORAMA_AVAILABLE: + COLORS = { + 'DEBUG': Fore.CYAN, + 'INFO': Fore.GREEN, + 'WARNING': Fore.YELLOW, + 'ERROR': Fore.RED, + 'CRITICAL': Fore.RED + Style.BRIGHT, + } + else: + COLORS = {} + + RESET = Style.RESET_ALL if COLORAMA_AVAILABLE else '' + + def format(self, record): + log_color = self.COLORS.get(record.levelname, '') + record.levelname = f"{log_color}{record.levelname}{self.RESET}" + return super().format(record) + + +def setup_logger( + name: str = "diffulex", + level: int = logging.INFO, + log_file: Optional[str] = None, + use_rich: bool = True, +) -> logging.Logger: + """ + Setup a professional logger with colored output + + Args: + name: Logger name + level: Logging level + log_file: Optional log file path + use_rich: Whether to use rich library for better formatting + + Returns: + Configured logger + """ + logger = logging.getLogger(name) + logger.setLevel(level) + logger.handlers.clear() + logger.propagate = False # Prevent propagation to root logger to avoid duplicate output + + # Use Rich if available and requested + if use_rich and RICH_AVAILABLE: + console = Console(stderr=True) + handler = RichHandler( + console=console, + show_time=True, + show_path=False, + rich_tracebacks=True, + markup=True, + ) + handler.setFormatter(logging.Formatter( + "%(message)s", + datefmt="[%X]" + )) + logger.addHandler(handler) + + # Install rich traceback + install_rich_traceback(show_locals=True) + else: + # Fallback to colored console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(level) + + if COLORAMA_AVAILABLE: + formatter = ColoredFormatter( + '%(asctime)s | 
%(levelname)-8s | %(name)s | %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + else: + formatter = logging.Formatter( + '%(asctime)s | %(levelname)-8s | %(name)s | %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + # Add file handler if specified + if log_file: + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + file_handler = logging.FileHandler(log_file, encoding='utf-8') + file_handler.setLevel(level) + file_formatter = logging.Formatter( + '%(asctime)s | %(levelname)-8s | %(name)s | %(funcName)s:%(lineno)d | %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + return logger + + +def get_logger(name: str = "diffulex") -> logging.Logger: + """ + Get or create a logger + + Args: + name: Logger name + + Returns: + Logger instance + """ + logger = logging.getLogger(name) + if not logger.handlers: + # Setup default logger if not already configured + setup_logger(name) + # Ensure propagate is False to avoid duplicate output + logger.propagate = False + return logger + + +class LoggerMixin: + """Mixin class to add logger property to classes""" + + @property + def logger(self) -> logging.Logger: + """Get logger for this class""" + return get_logger(self.__class__.__module__) + + +# Add success method to logger +def _add_success_method(): + """Add success method to logging.Logger class""" + if RICH_AVAILABLE: + def success(self, message: str, *args, **kwargs): + """Log success message with rich formatting""" + self.info(f"[green]✓[/green] {message}", *args, **kwargs) + else: + def success(self, message: str, *args, **kwargs): + """Log success message""" + if COLORAMA_AVAILABLE: + self.info(f"{Fore.GREEN}✓{Style.RESET_ALL} {message}", *args, **kwargs) + else: + self.info(f"✓ {message}", *args, **kwargs) + + if not hasattr(logging.Logger, 'success'): + logging.Logger.success = success + + +# Initialize success method +_add_success_method() + diff --git a/diffulex/model/config/dream/configuration_dream.py b/diffulex/model/config/dream/configuration_dream.py index 6a8c49d..ec83795 100755 --- a/diffulex/model/config/dream/configuration_dream.py +++ b/diffulex/model/config/dream/configuration_dream.py @@ -17,10 +17,10 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_rope_utils import rope_config_validation -from transformers.utils import logging +from diffulex.logger import get_logger -logger = logging.get_logger(__name__) +logger = get_logger(__name__) class DreamConfig(PretrainedConfig): diff --git a/diffulex/model/config/fast_dllm_v2/configuration_fast_dllm_v2.py b/diffulex/model/config/fast_dllm_v2/configuration_fast_dllm_v2.py index ab484c6..0b373ac 100755 --- a/diffulex/model/config/fast_dllm_v2/configuration_fast_dllm_v2.py +++ b/diffulex/model/config/fast_dllm_v2/configuration_fast_dllm_v2.py @@ -17,10 +17,10 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_rope_utils import rope_config_validation -from transformers.utils import logging +from diffulex.logger import get_logger -logger = logging.get_logger(__name__) +logger = get_logger(__name__) class FastdLLMV2Config(PretrainedConfig): diff --git a/diffulex/model/config/sdar/configuration_sdar.py b/diffulex/model/config/sdar/configuration_sdar.py index f201418..fed2675 100644 --- a/diffulex/model/config/sdar/configuration_sdar.py +++ 
b/diffulex/model/config/sdar/configuration_sdar.py @@ -3,10 +3,10 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_rope_utils import rope_config_validation -from transformers.utils import logging +from diffulex.logger import get_logger -logger = logging.get_logger(__name__) +logger = get_logger(__name__) class SDARConfig(PretrainedConfig): diff --git a/diffulex/sampler/base.py b/diffulex/sampler/base.py index 34f394f..3fec283 100644 --- a/diffulex/sampler/base.py +++ b/diffulex/sampler/base.py @@ -7,6 +7,9 @@ from easydict import EasyDict as edict from diffulex.engine.sequence import SequenceBase +from diffulex.logger import get_logger + +logger = get_logger(__name__) class SamplerBase(nn.Module): @@ -93,7 +96,7 @@ def _fetch_last_logits(self, logits: torch.Tensor, seq: SequenceBase) -> torch.T def _shift_logits(self, logits, last_logit=None): if logits.shape[1] == 0: - print("Warning: logits sequence length is 0, returning empty logits") + logger.warning("Logits sequence length is 0, returning empty logits") raise Exception("logits sequence length is 0") shifted_logits = torch.zeros_like(logits) diff --git a/diffulex/strategy/fast_dllm_v2/engine/sequence.py b/diffulex/strategy/fast_dllm_v2/engine/sequence.py index 16453e5..d105a55 100644 --- a/diffulex/strategy/fast_dllm_v2/engine/sequence.py +++ b/diffulex/strategy/fast_dllm_v2/engine/sequence.py @@ -16,11 +16,13 @@ class FDV2BlockStatus(Enum): class FDV2SubBlockStatus(Enum): ACTIVE = auto() + TO_DUAL_CACHE = auto() + IN_DUAL_CACHE = auto() @dataclass class FDV2SubBlock: - pass - + sub_block_id: int = 0 + status: FDV2SubBlockStatus = FDV2SubBlockStatus.ACTIVE @dataclass class FDV2Block: diff --git a/diffulex/utils/loader.py b/diffulex/utils/loader.py index b2e7cbe..ffdb689 100755 --- a/diffulex/utils/loader.py +++ b/diffulex/utils/loader.py @@ -8,6 +8,9 @@ from functools import partial from safetensors import safe_open from diffulex.config import Config +from diffulex.logger import get_logger + +logger = get_logger(__name__) def load_lora_config(lora_path: str) -> dict: @@ -47,10 +50,10 @@ def load_model(model: nn.Module, config: Config): if config.use_lora and config.lora_path: lora_config = load_lora_config(config.lora_path) if lora_config: - print(f"LoRA Config Loaded: {lora_config}") + logger.info(f"LoRA Config Loaded: {lora_config}") model = enable_lora_for_model(model, lora_config) else: - print("No adapter_config.json found, using default LoRA parameters") + logger.info("No adapter_config.json found, using default LoRA parameters") default_config = {'r': 16, 'lora_alpha': 32.0, 'lora_dropout': 0.0} model = enable_lora_for_model(model, default_config) @@ -92,12 +95,12 @@ def load_model(model: nn.Module, config: Config): # Load LoRA weights if enabled if config.use_lora and config.lora_path: if os.path.exists(config.lora_path): - print(f"Loading LoRA weights from {config.lora_path}") + logger.info(f"Loading LoRA weights from {config.lora_path}") load_lora_weights_fn = partial(load_lora_weights, model, config.lora_path) packed_modules_mapping = packed_modules_mapping if config.model_name == "llada" else None model = load_lora_weights_fn(packed_modules_mapping=packed_modules_mapping) else: - print(f"Warning: LoRA path {config.lora_path} does not exist, skipping LoRA loading") + logger.warning(f"LoRA path {config.lora_path} does not exist, skipping LoRA loading") return model @@ -189,16 +192,16 @@ def load_lora_weights(model: nn.Module, lora_path: str, packed_modules_mapping: 
module.lora_B.data.copy_(found_b) applied_count += 1 except Exception as e: - print(f"Failed to load LoRA weights for {name}: {e}") + logger.warning(f"Failed to load LoRA weights for {name}: {e}") for module in model.modules(): if hasattr(module, 'merge_lora'): module.merge_lora() - print(f"LoRA weights applied to {applied_count} layers and merged") + logger.info(f"LoRA weights applied to {applied_count} layers and merged") except Exception as e: - print(f"Error loading LoRA weights: {e}") - print("Continuing with base model only") + logger.error(f"Error loading LoRA weights: {e}") + logger.warning("Continuing with base model only") return model diff --git a/diffulex_bench/README.md b/diffulex_bench/README.md new file mode 100644 index 0000000..158b266 --- /dev/null +++ b/diffulex_bench/README.md @@ -0,0 +1,323 @@ +# Diffulex Benchmark + +Benchmark framework for evaluating Diffulex inference engine using lm-evaluation-harness. + +## Features + +- ✅ **lm-evaluation-harness Integration**: Full support for 50+ evaluation tasks +- ✅ **YAML Configuration**: Clean and readable configuration files +- ✅ **Professional Logging**: Colored output with rich formatting +- ✅ **Flexible Configuration**: Support both config files and command-line arguments +- ✅ **Multiple Models**: Support for Dream, SDAR, Fast-dLLM-v2 models +- ✅ **Multiple Strategies**: D2F, Block Diffusion, Fast-dLLM decoding strategies + +## Quick Start + +### Installation + +```bash +# Install dependencies +pip install lm-eval rich colorama + +# Install diffulex (if not already installed) +pip install -e . +``` + +### Using Configuration File (Recommended) + +1. **Create or use existing config file**: + +```bash +# Copy example config +cp diffulex_bench/configs/example.yml my_config.yml + +# Edit the config file +vim my_config.yml +``` + +2. **Run benchmark**: + +```bash +python -m diffulex_bench.main --config my_config.yml +``` + +### Using Command Line Arguments + +```bash +python -m diffulex_bench.main \ + --model-path /path/to/model \ + --model-name dream \ + --decoding-strategy d2f \ + --dataset gsm8k \ + --dataset-limit 100 \ + --temperature 0.0 \ + --max-tokens 256 \ + --output-dir ./results +``` + +## Configuration Files + +Configuration files are located in `diffulex_bench/configs/` directory. We use YAML format for better readability. + +### Configuration Structure + +Configurations are organized into two sections: + +1. **`engine`**: Engine configuration (model weights, LoRA, model name, strategy, inference parameters) +2. 
**`eval`**: Evaluation configuration (dataset, tasks, sampling parameters, output settings) + +### Example Configuration + +See `diffulex_bench/configs/example.yml` for a complete example: + +```yaml +# Engine configuration - Parameters for Diffulex engine +engine: + # Model and weights + model_path: "/path/to/your/model" + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + # LoRA configuration + use_lora: false + lora_path: "" + + # Parallelism and memory + tensor_parallel_size: 1 + data_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 2048 + + # D2F-specific parameters + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + +# Evaluation configuration - Parameters for benchmark +eval: + # Task/Dataset + dataset_name: "gsm8k" + dataset_limit: 100 + + # Sampling + temperature: 0.0 + max_tokens: 256 + + # Output + output_dir: "benchmark_results" +``` + +### Pre-configured Examples + +- `configs/example.yml`: Complete example with all options +- `configs/dream_d2f_gsm8k.yml`: Dream model with D2F strategy on GSM8K + +## Supported Tasks + +The framework supports all tasks available in lm-evaluation-harness, including: + +- **GSM8K**: Math word problems +- **HumanEval**: Code generation +- **HellaSwag**: Commonsense reasoning +- **MMLU**: Massive multitask language understanding +- And 50+ more tasks... + +See [lm-evaluation-harness tasks](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_table.md) for the complete list. + +## Model Configuration + +### Model Types + +- `dream`: Dream model +- `sdar`: SDAR model +- `fast_dllm_v2`: Fast-dLLM-v2 model + +### Decoding Strategies + +- `d2f`: Discrete Diffusion Forcing +- `block_diffusion`: Block Diffusion +- `fast_dllm`: Fast-dLLM + +### Example: Dream with D2F + +```yaml +engine: + model_path: "/path/to/dream/model" + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + +eval: + dataset_name: "gsm8k" + temperature: 0.0 + max_tokens: 256 +``` + +## Command Line Arguments + +### Basic Arguments + +```bash +--config PATH # Configuration file path (YAML or JSON) +--model-path PATH # Model path (required if no config) +--dataset TASK # Task name (e.g., gsm8k, humaneval) +--output-dir PATH # Output directory +``` + +### Model Arguments + +```bash +--model-name NAME # Model name: dream, sdar, fast_dllm_v2 +--decoding-strategy STR # Strategy: d2f, block_diffusion, fast_dllm +--mask-token-id ID # Mask token ID +``` + +### Inference Arguments + +```bash +--tensor-parallel-size N # Tensor parallel size +--data-parallel-size N # Data parallel size +--gpu-memory-utilization F # GPU memory utilization (0.0-1.0) +--max-model-len N # Maximum model length +``` + +### Sampling Arguments + +```bash +--temperature F # Sampling temperature +--max-tokens N # Maximum tokens to generate +``` + +### Logging Arguments + +```bash +--log-file PATH # Log file path (optional) +--log-level LEVEL # Log level: DEBUG, INFO, WARNING, ERROR +``` + +## Output + +Results are saved to the output directory (default: `benchmark_results/`) with: + +- Evaluation results in JSON format +- Detailed metrics and statistics +- Configuration used for the run +- Timestamp information + +## Examples + +### Example 1: GSM8K Evaluation + +```bash +python -m diffulex_bench.main \ + --config diffulex_bench/configs/dream_d2f_gsm8k.yml \ + --dataset-limit 100 +``` + +### Example 2: Custom Configuration + +```bash 
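+# Note: the paths below are placeholders; point --model-path at a real model
+# checkpoint and --output-dir / --log-file at writable locations.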
+python -m diffulex_bench.main \ + --model-path /path/to/model \ + --model-name dream \ + --decoding-strategy d2f \ + --dataset gsm8k \ + --temperature 0.0 \ + --max-tokens 512 \ + --output-dir ./my_results \ + --log-file ./benchmark.log +``` + +### Example 3: Using Default Config + +```bash +# If configs/example.yml exists, it will be used automatically +python -m diffulex_bench.main \ + --model-path /path/to/model \ + --dataset gsm8k +``` + +## Architecture + +``` +main.py (Entry Point) + ↓ +arg_parser.py (Argument Parsing) + ↓ +config.py (Configuration Management) + ↓ +run_benchmark() (Benchmark Execution) + ↓ +lm_eval.cli_evaluate() (Evaluation Framework) + ↓ +DiffulexLM (Model Interface) + ↓ +BenchmarkRunner (Engine Wrapper) + ↓ +Diffulex (Inference Engine) +``` + +## Advanced Usage + +### Custom Model Integration + +The framework uses `DiffulexLM` class which wraps `BenchmarkRunner`. You can extend it for custom models: + +```python +from diffulex_bench.lm_eval_model import DiffulexLM + +# DiffulexLM automatically registers with lm_eval +# Use it in lm_eval commands +``` + +### Programmatic Usage + +```python +from diffulex_bench.config import BenchmarkConfig, EngineConfig, EvalConfig +from diffulex_bench.main import run_benchmark + +# Load from YAML file +config = BenchmarkConfig.from_yaml("diffulex_bench/configs/example.yml") +run_benchmark(config) + +# Or create programmatically +engine = EngineConfig( + model_path="/path/to/model", + model_name="dream", + decoding_strategy="d2f", +) +eval_config = EvalConfig( + dataset_name="gsm8k", + temperature=0.0, + max_tokens=256, +) +config = BenchmarkConfig(engine=engine, eval=eval_config) +run_benchmark(config) +``` + +## Troubleshooting + +### Common Issues + +1. **lm-eval not found**: Install with `pip install lm-eval` +2. **Config file not found**: Check path or use absolute path +3. **Model loading fails**: Verify model path and model_name match +4. **Out of memory**: Reduce `gpu_memory_utilization` or `max_model_len` + +### Getting Help + +- Check logs with `--log-level DEBUG` +- Save logs to file with `--log-file benchmark.log` +- Verify configuration with `--config` option + +## Notes + +1. The framework uses **lm-evaluation-harness** for all evaluation logic +2. Configuration files use **YAML** format (JSON also supported) +3. All evaluation metrics are computed by lm-eval +4. Results follow lm-eval output format +5. 
GPU environment is recommended for best performance diff --git a/diffulex_bench/__init__.py b/diffulex_bench/__init__.py new file mode 100644 index 0000000..b9a730d --- /dev/null +++ b/diffulex_bench/__init__.py @@ -0,0 +1,36 @@ +""" +Diffulex Benchmark - Benchmark framework for evaluating Diffulex inference engine performance +""" + +from diffulex_bench.runner import BenchmarkRunner +from diffulex_bench.datasets import load_benchmark_dataset +from diffulex_bench.metrics import compute_metrics +from diffulex.logger import setup_logger, get_logger +from diffulex_bench.config import BenchmarkConfig, EngineConfig, EvalConfig + +# Import lm_eval model to register it +try: + from diffulex_bench.lm_eval_model import DiffulexLM + __all__ = [ + "BenchmarkRunner", + "load_benchmark_dataset", + "compute_metrics", + "setup_logger", + "get_logger", + "BenchmarkConfig", + "EngineConfig", + "EvalConfig", + "DiffulexLM", + ] +except ImportError: + __all__ = [ + "BenchmarkRunner", + "load_benchmark_dataset", + "compute_metrics", + "setup_logger", + "get_logger", + "BenchmarkConfig", + "EngineConfig", + "EvalConfig", + ] + diff --git a/diffulex_bench/arg_parser.py b/diffulex_bench/arg_parser.py new file mode 100644 index 0000000..77a2ddb --- /dev/null +++ b/diffulex_bench/arg_parser.py @@ -0,0 +1,260 @@ +""" +Argument Parser - Command line argument parsing for benchmark +""" + +import argparse +from pathlib import Path + + +def create_argument_parser() -> argparse.ArgumentParser: + """ + Create and configure argument parser for benchmark + + Returns: + Configured ArgumentParser instance + """ + parser = argparse.ArgumentParser( + description="Diffulex Benchmark using lm-evaluation-harness", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Using configuration file (recommended) + python -m diffulex_bench.main --config diffulex_bench/configs/example.yml + + # Using command line arguments + python -m diffulex_bench.main \\ + --model-path /path/to/model \\ + --dataset gsm8k \\ + --dataset-limit 100 \\ + --output-dir ./results + + # With custom model settings + python -m diffulex_bench.main \\ + --model-path /path/to/model \\ + --model-name dream \\ + --decoding-strategy d2f \\ + --dataset gsm8k \\ + --temperature 0.0 \\ + --max-tokens 256 + """ + ) + + # Logging arguments + parser.add_argument( + "--log-file", + type=str, + default=None, + help="Log file path (optional)", + ) + parser.add_argument( + "--log-level", + type=str, + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Logging level", + ) + + # Configuration file + parser.add_argument( + "--config", + type=str, + help="Configuration file path (YAML or JSON). 
Default: configs/example.yml", + ) + + # Model arguments + parser.add_argument( + "--model-path", + type=str, + help="Model path", + ) + parser.add_argument( + "--tokenizer-path", + type=str, + default=None, + help="Tokenizer path (defaults to model-path)", + ) + parser.add_argument( + "--model-name", + type=str, + default="dream", + choices=["dream", "sdar", "fast_dllm_v2"], + help="Model name", + ) + parser.add_argument( + "--decoding-strategy", + type=str, + default="d2f", + choices=["d2f", "block_diffusion", "fast_dllm"], + help="Decoding strategy", + ) + parser.add_argument( + "--mask-token-id", + type=int, + default=151666, + help="Mask token ID", + ) + + # Inference arguments + parser.add_argument( + "--tensor-parallel-size", + type=int, + default=1, + help="Tensor parallel size", + ) + parser.add_argument( + "--data-parallel-size", + type=int, + default=1, + help="Data parallel size", + ) + parser.add_argument( + "--gpu-memory-utilization", + type=float, + default=0.9, + help="GPU memory utilization", + ) + parser.add_argument( + "--max-model-len", + type=int, + default=2048, + help="Maximum model length", + ) + parser.add_argument( + "--max-num-batched-tokens", + type=int, + default=4096, + help="Maximum number of batched tokens", + ) + parser.add_argument( + "--max-num-seqs", + type=int, + default=128, + help="Maximum number of sequences", + ) + + # Sampling arguments + parser.add_argument( + "--temperature", + type=float, + default=0.0, + help="Sampling temperature", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=256, + help="Maximum tokens to generate", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Ignore EOS token", + ) + + # Dataset arguments + parser.add_argument( + "--dataset", + type=str, + default="gsm8k", + help="Dataset/task name (e.g., gsm8k, humaneval)", + ) + parser.add_argument( + "--dataset-split", + type=str, + default="test", + help="Dataset split", + ) + parser.add_argument( + "--dataset-limit", + type=int, + default=None, + help="Limit number of samples", + ) + + # Output arguments + parser.add_argument( + "--output-dir", + type=str, + default="benchmark_results", + help="Output directory", + ) + parser.add_argument( + "--save-results", + action="store_true", + default=True, + help="Save results to file", + ) + parser.add_argument( + "--no-save-results", + dest="save_results", + action="store_false", + help="Do not save results to file", + ) + + # LoRA arguments + parser.add_argument( + "--use-lora", + action="store_true", + help="Use LoRA", + ) + parser.add_argument( + "--lora-path", + type=str, + default="", + help="LoRA path", + ) + + # Engine arguments + parser.add_argument( + "--enforce-eager", + action="store_true", + help="Enforce eager mode (disable CUDA graphs)", + ) + parser.add_argument( + "--kv-cache-layout", + type=str, + default="unified", + choices=["unified", "distinct"], + help="KV cache layout", + ) + + # D2F-specific arguments + parser.add_argument( + "--accept-threshold", + type=float, + default=0.9, + help="Accept threshold for D2F", + ) + parser.add_argument( + "--complete-threshold", + type=float, + default=0.95, + help="Complete threshold for D2F", + ) + parser.add_argument( + "--add-new-block-threshold", + type=float, + default=0.1, + help="Add new block threshold for D2F", + ) + parser.add_argument( + "--diffusion-block-size", + type=int, + default=32, + help="Diffusion block size", + ) + + return parser + + +def get_default_config_path() -> Path: + """ + Get default configuration file 
path + + Returns: + Path to default config file + """ + config_dir = Path(__file__).parent / "configs" + default_config = config_dir / "example.yml" + return default_config + diff --git a/diffulex_bench/config.py b/diffulex_bench/config.py new file mode 100644 index 0000000..90ea260 --- /dev/null +++ b/diffulex_bench/config.py @@ -0,0 +1,245 @@ +""" +Benchmark Configuration - Configuration management with separated engine and eval configs +""" + +from dataclasses import dataclass, field +from typing import Optional, Dict, Any +import json +import yaml + + +@dataclass +class EngineConfig: + """ + Engine configuration - Parameters for Diffulex engine initialization + """ + # Model and weights + model_path: str + tokenizer_path: Optional[str] = None + model_name: str = "dream" # Options: dream, sdar, fast_dllm_v2 + decoding_strategy: str = "d2f" # Options: d2f, block_diffusion, fast_dllm + mask_token_id: int = 151666 + + # LoRA configuration + use_lora: bool = False + lora_path: str = "" + + # Parallelism configuration + tensor_parallel_size: int = 1 + data_parallel_size: int = 1 + + # Memory and capacity configuration + gpu_memory_utilization: float = 0.9 + max_model_len: int = 2048 + max_num_batched_tokens: int = 4096 + max_num_seqs: int = 128 + + # Engine behavior configuration + enforce_eager: bool = False + kv_cache_layout: str = "unified" # Options: unified, distinct + + # D2F-specific configuration + accept_threshold: float = 0.9 + complete_threshold: float = 0.95 + add_new_block_threshold: float = 0.1 + diffusion_block_size: int = 32 + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> "EngineConfig": + """Create engine configuration from dictionary""" + return cls(**config_dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + return { + field.name: getattr(self, field.name) + for field in self.__dataclass_fields__.values() + } + + def get_diffulex_kwargs(self) -> Dict[str, Any]: + """Get arguments to pass to Diffulex engine""" + return { + 'model_name': self.model_name, + 'decoding_strategy': self.decoding_strategy, + 'mask_token_id': self.mask_token_id, + 'tensor_parallel_size': self.tensor_parallel_size, + 'data_parallel_size': self.data_parallel_size, + 'gpu_memory_utilization': self.gpu_memory_utilization, + 'max_model_len': self.max_model_len, + 'max_num_batched_tokens': self.max_num_batched_tokens, + 'max_num_seqs': self.max_num_seqs, + 'use_lora': self.use_lora, + 'lora_path': self.lora_path if self.use_lora else "", + 'enforce_eager': self.enforce_eager, + 'kv_cache_layout': self.kv_cache_layout, + 'accept_threshold': self.accept_threshold, + 'complete_threshold': self.complete_threshold, + 'add_new_block_threshold': self.add_new_block_threshold, + 'diffusion_block_size': self.diffusion_block_size, + } + + +@dataclass +class EvalConfig: + """ + Evaluation configuration - Parameters for benchmark evaluation + """ + # Task/Dataset configuration + dataset_name: str = "gsm8k" # Task name (e.g., gsm8k, humaneval) + dataset_split: str = "test" + dataset_limit: Optional[int] = None + + # Sampling configuration + temperature: float = 0.0 + max_tokens: int = 256 + ignore_eos: bool = False + + # Output configuration + output_dir: str = "benchmark_results" + save_results: bool = True + use_tqdm: bool = True + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> "EvalConfig": + """Create evaluation configuration from dictionary""" + return cls(**config_dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary""" + 
return { + field.name: getattr(self, field.name) + for field in self.__dataclass_fields__.values() + } + + def get_sampling_params(self): + """Get sampling parameters""" + from diffulex import SamplingParams + return SamplingParams( + temperature=self.temperature, + max_tokens=self.max_tokens, + ignore_eos=self.ignore_eos, + ) + + +@dataclass +class BenchmarkConfig: + """ + Benchmark configuration - Combines engine and evaluation configurations + """ + engine: EngineConfig + eval: EvalConfig + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> "BenchmarkConfig": + """ + Create benchmark configuration from dictionary + + Supports both flat and nested dictionary structures for backward compatibility + """ + # Check if config_dict has nested structure + if 'engine' in config_dict and 'eval' in config_dict: + engine = EngineConfig.from_dict(config_dict['engine']) + eval_config = EvalConfig.from_dict(config_dict['eval']) + else: + # Flat structure - backward compatibility + # Split fields into engine and eval + engine_fields = { + 'model_path', 'tokenizer_path', 'model_name', 'decoding_strategy', + 'mask_token_id', 'use_lora', 'lora_path', 'tensor_parallel_size', + 'data_parallel_size', 'gpu_memory_utilization', 'max_model_len', + 'max_num_batched_tokens', 'max_num_seqs', 'enforce_eager', + 'kv_cache_layout', 'accept_threshold', 'complete_threshold', + 'add_new_block_threshold', 'diffusion_block_size' + } + + engine_dict = {k: v for k, v in config_dict.items() if k in engine_fields} + eval_dict = {k: v for k, v in config_dict.items() if k not in engine_fields} + + engine = EngineConfig.from_dict(engine_dict) + eval_config = EvalConfig.from_dict(eval_dict) + + return cls(engine=engine, eval=eval_config) + + @classmethod + def from_json(cls, json_path: str) -> "BenchmarkConfig": + """Load configuration from JSON file""" + with open(json_path, 'r', encoding='utf-8') as f: + config_dict = json.load(f) + return cls.from_dict(config_dict) + + @classmethod + def from_yaml(cls, yaml_path: str) -> "BenchmarkConfig": + """Load configuration from YAML file""" + with open(yaml_path, 'r', encoding='utf-8') as f: + config_dict = yaml.safe_load(f) + return cls.from_dict(config_dict) + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary with nested structure""" + return { + 'engine': self.engine.to_dict(), + 'eval': self.eval.to_dict(), + } + + def save_json(self, json_path: str): + """Save to JSON file""" + with open(json_path, 'w', encoding='utf-8') as f: + json.dump(self.to_dict(), f, indent=2, ensure_ascii=False) + + def save_yaml(self, yaml_path: str): + """Save to YAML file""" + with open(yaml_path, 'w', encoding='utf-8') as f: + yaml.dump(self.to_dict(), f, allow_unicode=True, default_flow_style=False) + + def get_diffulex_kwargs(self) -> Dict[str, Any]: + """Get arguments to pass to Diffulex engine""" + return self.engine.get_diffulex_kwargs() + + def get_sampling_params(self): + """Get sampling parameters""" + return self.eval.get_sampling_params() + + # Convenience properties for backward compatibility + @property + def model_path(self) -> str: + return self.engine.model_path + + @property + def tokenizer_path(self) -> Optional[str]: + return self.engine.tokenizer_path + + @property + def model_name(self) -> str: + return self.engine.model_name + + @property + def decoding_strategy(self) -> str: + return self.engine.decoding_strategy + + @property + def dataset_name(self) -> str: + return self.eval.dataset_name + + @property + def dataset_limit(self) -> Optional[int]: + 
return self.eval.dataset_limit + + @property + def output_dir(self) -> str: + return self.eval.output_dir + + @dataset_name.setter + def dataset_name(self, value: str): + self.eval.dataset_name = value + + @dataset_limit.setter + def dataset_limit(self, value: Optional[int]): + self.eval.dataset_limit = value + + @output_dir.setter + def output_dir(self, value: str): + self.eval.output_dir = value + + @model_path.setter + def model_path(self, value: str): + self.engine.model_path = value diff --git a/diffulex_bench/configs/__init__.py b/diffulex_bench/configs/__init__.py new file mode 100644 index 0000000..51b7ec8 --- /dev/null +++ b/diffulex_bench/configs/__init__.py @@ -0,0 +1,4 @@ +""" +Configuration files for Diffulex benchmarks +""" + diff --git a/diffulex_bench/configs/dream_d2f_gsm8k.yml b/diffulex_bench/configs/dream_d2f_gsm8k.yml new file mode 100644 index 0000000..e55b9be --- /dev/null +++ b/diffulex_bench/configs/dream_d2f_gsm8k.yml @@ -0,0 +1,29 @@ +# Dream model with D2F strategy on GSM8K dataset +# Quick configuration example + +engine: + model_path: "/path/to/dream/model" + model_name: "dream" + decoding_strategy: "d2f" + mask_token_id: 151666 + + tensor_parallel_size: 1 + data_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 2048 + + use_lora: false + enforce_eager: false + + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + +eval: + dataset_name: "gsm8k" + dataset_limit: 100 + + temperature: 0.0 + max_tokens: 256 + + output_dir: "benchmark_results" diff --git a/diffulex_bench/configs/example.yml b/diffulex_bench/configs/example.yml new file mode 100644 index 0000000..26d96d1 --- /dev/null +++ b/diffulex_bench/configs/example.yml @@ -0,0 +1,52 @@ +# Diffulex Benchmark Configuration Example +# This configuration uses nested structure with engine and eval sections + +# Engine configuration - Parameters for Diffulex engine initialization +engine: + # Model and weights + model_path: "/path/to/your/model" + tokenizer_path: null # Optional, defaults to model_path + model_name: "dream" # Options: dream, sdar, fast_dllm_v2 + decoding_strategy: "d2f" # Options: d2f, block_diffusion, fast_dllm + mask_token_id: 151666 + + # LoRA configuration + use_lora: false + lora_path: "" + + # Parallelism configuration + tensor_parallel_size: 1 + data_parallel_size: 1 + + # Memory and capacity configuration + gpu_memory_utilization: 0.9 + max_model_len: 2048 + max_num_batched_tokens: 4096 + max_num_seqs: 128 + + # Engine behavior configuration + enforce_eager: false + kv_cache_layout: "unified" # Options: unified, distinct + + # D2F-specific configuration + accept_threshold: 0.9 + complete_threshold: 0.95 + add_new_block_threshold: 0.1 + diffusion_block_size: 32 + +# Evaluation configuration - Parameters for benchmark evaluation +eval: + # Task/Dataset configuration + dataset_name: "gsm8k" # Options: gsm8k, humaneval, etc. + dataset_split: "test" + dataset_limit: 100 # Optional, limit number of samples + + # Sampling configuration + temperature: 0.0 + max_tokens: 256 + ignore_eos: false + + # Output configuration + output_dir: "benchmark_results" + save_results: true + use_tqdm: true diff --git a/diffulex_bench/datasets.py b/diffulex_bench/datasets.py new file mode 100644 index 0000000..3a882cf --- /dev/null +++ b/diffulex_bench/datasets.py @@ -0,0 +1,118 @@ +""" +Benchmark Datasets - Dataset loaders for benchmark evaluation +Supports common evaluation datasets such as GSM8K, HumanEval, etc. 
+""" + +from typing import List, Dict, Any, Optional, Callable +from datasets import load_dataset + + +def load_gsm8k( + split: str = "test", + limit: Optional[int] = None, + prompt_template: Optional[Callable[[str], str]] = None, +) -> List[Dict[str, Any]]: + """ + Load GSM8K dataset + + Args: + split: Dataset split, default "test" + limit: Limit number of samples, None means all + prompt_template: Prompt template function that takes question string and returns full prompt + + Returns: + List of dataset items, each containing 'prompt' and 'answer' fields + """ + dataset = load_dataset("gsm8k", "main", split=split) + + if limit: + dataset = dataset[:limit] + + results = [] + for item in dataset: + question = item["question"] + answer = item["answer"] + + if prompt_template: + prompt = prompt_template(question) + else: + # Default template + prompt = f"Question: {question}\nAnswer:" + + results.append({ + 'prompt': prompt, + 'answer': answer, + 'question': question, + }) + + return results + + +def load_humaneval( + limit: Optional[int] = None, + prompt_template: Optional[Callable[[str], str]] = None, +) -> List[Dict[str, Any]]: + """ + Load HumanEval dataset + + Args: + limit: Limit number of samples, None means all + prompt_template: Prompt template function that takes prompt string and returns full prompt + + Returns: + List of dataset items, each containing 'prompt', 'test', 'entry_point' fields + """ + dataset = load_dataset("openai/humaneval", split="test") + + if limit: + dataset = dataset[:limit] + + results = [] + for item in dataset: + prompt = item["prompt"] + test = item["test"] + entry_point = item["entry_point"] + + if prompt_template: + full_prompt = prompt_template(prompt) + else: + full_prompt = prompt + + results.append({ + 'prompt': full_prompt, + 'original_prompt': prompt, + 'test': test, + 'entry_point': entry_point, + 'task_id': item.get('task_id', ''), + }) + + return results + + +def load_benchmark_dataset( + dataset_name: str, + **kwargs +) -> List[Dict[str, Any]]: + """ + Unified dataset loading interface + + Args: + dataset_name: Dataset name, supports "gsm8k", "humaneval" + **kwargs: Arguments passed to the specific dataset loader + + Returns: + List of dataset items + """ + loaders = { + 'gsm8k': load_gsm8k, + 'humaneval': load_humaneval, + } + + if dataset_name not in loaders: + raise ValueError( + f"Unknown dataset: {dataset_name}. 
" + f"Supported datasets: {list(loaders.keys())}" + ) + + return loaders[dataset_name](**kwargs) + diff --git a/diffulex_bench/lm_eval_model.py b/diffulex_bench/lm_eval_model.py new file mode 100644 index 0000000..2b1c0a5 --- /dev/null +++ b/diffulex_bench/lm_eval_model.py @@ -0,0 +1,317 @@ +""" +LM Eval Model - Diffulex integration with lm-evaluation-harness +""" + +import logging +import time +import json +from typing import List, Optional, Tuple, Type, TypeVar, Union +from pathlib import Path + +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model + +from diffulex import Diffulex, SamplingParams +from diffulex_bench.runner import BenchmarkRunner +from diffulex.logger import setup_logger, get_logger + +T = TypeVar("T", bound="LM") +eval_logger = logging.getLogger(__name__) + + +@register_model("diffulex") +class DiffulexLM(LM): + """ + Diffulex model integration for lm-evaluation-harness + """ + + def __init__( + self, + pretrained: str, + batch_size: Optional[Union[int, str]] = 1, + device: Optional[str] = "cuda", + dtype: Optional[Union[str, type]] = "auto", + max_new_tokens: Optional[int] = 256, + max_length: Optional[int] = 2048, + add_bos_token: Optional[bool] = False, + trust_remote_code: Optional[bool] = True, + temperature: Optional[float] = 0.0, + model_name: Optional[str] = "dream", + decoding_strategy: Optional[str] = "d2f", + mask_token_id: Optional[int] = 151666, + tensor_parallel_size: Optional[int] = 1, + data_parallel_size: Optional[int] = 1, + gpu_memory_utilization: Optional[float] = 0.9, + max_model_len: Optional[int] = 2048, + max_num_batched_tokens: Optional[int] = 4096, + max_num_seqs: Optional[int] = 128, + use_lora: Optional[bool] = False, + lora_path: Optional[str] = "", + enforce_eager: Optional[bool] = False, + kv_cache_layout: Optional[str] = "unified", + accept_threshold: Optional[float] = 0.9, + complete_threshold: Optional[float] = 0.95, + add_new_block_threshold: Optional[float] = 0.1, + diffusion_block_size: Optional[int] = 32, + save_dir: Optional[str] = None, + wait_ready: Optional[bool] = True, + **kwargs, + ) -> None: + super().__init__() + + # Setup logger + self.logger = get_logger(__name__) + + assert isinstance(pretrained, str) + assert isinstance(batch_size, (int, str)) + + self.pretrained = pretrained + self.batch_size_per_gpu = batch_size + if isinstance(batch_size, str): + self.batch_size_per_gpu = int(batch_size) + + self.max_length = max_length + self.add_bos_token = add_bos_token + self.max_new_tokens = max_new_tokens + self.temperature = temperature + self.save_dir = save_dir + + # Diffulex-specific parameters + self.model_name = model_name + self.decoding_strategy = decoding_strategy + self.mask_token_id = mask_token_id + + # Statistics tracking + self.total_generated_tokens = 0 + self.total_nfe = 0 # Number of Forward Evaluations (diffusion steps) + self.total_generation_time = 0.0 + self.total_samples = 0 + self.all_generation_times = [] + self.all_nfe = [] + self.all_tokens = [] + + # Initialize Diffulex runner + self.runner = BenchmarkRunner( + model_path=pretrained, + tokenizer_path=pretrained, + wait_ready=wait_ready, + model_name=model_name, + decoding_strategy=decoding_strategy, + mask_token_id=mask_token_id, + tensor_parallel_size=tensor_parallel_size, + data_parallel_size=data_parallel_size, + gpu_memory_utilization=gpu_memory_utilization, + max_model_len=max_model_len, + max_num_batched_tokens=max_num_batched_tokens, + 
max_num_seqs=max_num_seqs, + use_lora=use_lora, + lora_path=lora_path if use_lora else "", + enforce_eager=enforce_eager, + kv_cache_layout=kv_cache_layout, + accept_threshold=accept_threshold, + complete_threshold=complete_threshold, + add_new_block_threshold=add_new_block_threshold, + diffusion_block_size=diffusion_block_size, + ) + + self.tokenizer = self.runner.tokenizer + + # Create sampling params + self.sampling_params = SamplingParams( + temperature=temperature, + max_tokens=max_new_tokens, + ) + + self.logger.success("Diffulex engine initialized successfully") + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return "cuda" # Diffulex manages device internally + + @property + def rank(self): + return 0 + + @property + def world_size(self): + return 1 + + def tok_decode(self, tokens, skip_special_tokens=True): + """Decode tokens to text""" + if isinstance(tokens, list) and len(tokens) > 0 and isinstance(tokens[0], list): + return [self.tokenizer.decode(t, skip_special_tokens=skip_special_tokens) for t in tokens] + return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) + + def tok_encode(self, text, add_special_tokens=True): + """Encode text to tokens""" + return self.tokenizer( + text, return_tensors="pt", add_special_tokens=add_special_tokens + ).input_ids + + @classmethod + def create_from_arg_string( + cls: Type[T], arg_string: str, additional_config: Optional[dict] = None + ) -> T: + """ + Creates an instance of the LM class using the given argument string and additional config. + + Args: + arg_string: A string containing arguments in the format key1=value1,key2=value2 + additional_config: Optional dictionary containing additional configuration parameters + + Returns: + Instance of the LM class + """ + additional_config = {} if additional_config is None else additional_config + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(**args, **args2) + + def apply_chat_template( + self, chat_history, add_generation_prompt: bool = True + ) -> str: + """ + Apply a chat template to a list of chat history between user and model. + """ + chat_templated = self.tokenizer.apply_chat_template( + chat_history, + tokenize=False, + add_generation_prompt=add_generation_prompt, + continue_final_message=not add_generation_prompt, + ) + return chat_templated + + @property + def tokenizer_name(self) -> str: + return self.tokenizer.name_or_path.replace("/", "__") + + def generate_until(self, requests: List[Instance], disable_tqdm: bool = False): + """ + Generate text until stopping conditions are met. 
+ + Args: + requests: List of generation requests + disable_tqdm: Whether to disable progress bar + + Returns: + List of generated texts + """ + self.logger.info(f"Processing {len(requests)} generation requests...") + + # Prepare prompts + prompts = [] + gen_args = [] + + for req in requests: + prompt = req.arguments[0] + if self.add_bos_token and self.tokenizer.bos_token: + prompt = self.tokenizer.bos_token + prompt + prompts.append(prompt) + gen_args.append(req.arguments[1] if len(req.arguments) > 1 else {}) + + # Run generation + start_time = time.time() + outputs = self.runner.generate( + prompts, + self.sampling_params, + use_tqdm=not disable_tqdm, + ) + end_time = time.time() + + total_time = end_time - start_time + + # Extract results and accumulate statistics + results = [] + num_tokens = 0 + num_nfe = 0 + + for output in outputs: + text = output.get('text', '') + results.append(text) + + token_ids = output.get('token_ids', []) + n_diff_steps = output.get('n_diff_steps', 0) + + num_tokens += len(token_ids) + num_nfe += n_diff_steps + + self.all_generation_times.append(total_time / len(outputs) if outputs else 0) + self.all_nfe.append(n_diff_steps) + self.all_tokens.append(len(token_ids)) + + # Update statistics + self.total_samples += len(requests) + self.total_generated_tokens += num_tokens + self.total_nfe += num_nfe + self.total_generation_time += total_time + + # Log statistics + if self.total_samples > 0: + avg_tokens = self.total_generated_tokens / self.total_samples + avg_nfe = self.total_nfe / self.total_samples + avg_time = self.total_generation_time / self.total_samples + throughput = num_tokens / total_time if total_time > 0 else 0 + + self.logger.info( + f"Generated {len(results)} samples | " + f"Tokens: {num_tokens} | " + f"NFE: {num_nfe} | " + f"Time: {total_time:.2f}s | " + f"Throughput: {throughput:.2f} tok/s" + ) + + # Save statistics if save_dir is provided + if self.save_dir is not None: + self._save_statistics() + + return results + + def _save_statistics(self): + """Save statistics to file""" + import os + os.makedirs(self.save_dir, exist_ok=True) + + stats = { + 'total_samples': self.total_samples, + 'total_tokens': self.total_generated_tokens, + 'total_nfe': self.total_nfe, + 'total_time': self.total_generation_time, + 'avg_tokens_per_sample': self.total_generated_tokens / self.total_samples if self.total_samples > 0 else 0, + 'avg_nfe_per_sample': self.total_nfe / self.total_samples if self.total_samples > 0 else 0, + 'avg_time_per_sample': self.total_generation_time / self.total_samples if self.total_samples > 0 else 0, + 'throughput_tok_s': self.total_generated_tokens / self.total_generation_time if self.total_generation_time > 0 else 0, + 'nfe_per_token': self.total_nfe / self.total_generated_tokens if self.total_generated_tokens > 0 else 0, + 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'), + } + + stats_path = os.path.join(self.save_dir, 'diffulex_stats.json') + with open(stats_path, 'w', encoding='utf-8') as f: + json.dump(stats, f, indent=2, ensure_ascii=False) + + self.logger.info(f"Statistics saved to {stats_path}") + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + """ + Compute log-likelihood of continuations given contexts. + + Note: This is a placeholder implementation. Full loglikelihood computation + for diffusion models requires special handling. + """ + self.logger.warning( + "loglikelihood computation for diffusion models is not fully implemented. " + "Returning placeholder values." 
+ ) + return [(0.0, False) for _ in requests] + + def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: + """Compute log-likelihood of sequences.""" + raise NotImplementedError( + "loglikelihood_rolling is not implemented for diffusion models" + ) + diff --git a/diffulex_bench/logger.py b/diffulex_bench/logger.py new file mode 100644 index 0000000..444ee65 --- /dev/null +++ b/diffulex_bench/logger.py @@ -0,0 +1,16 @@ +""" +Logger module for diffulex_bench - Re-exports from diffulex.logger +""" + +# Re-export logger functionality from diffulex core package +from diffulex.logger import ( + setup_logger, + get_logger, + LoggerMixin, +) + +__all__ = [ + "setup_logger", + "get_logger", + "LoggerMixin", +] diff --git a/diffulex_bench/main.py b/diffulex_bench/main.py new file mode 100644 index 0000000..1c04cce --- /dev/null +++ b/diffulex_bench/main.py @@ -0,0 +1,262 @@ +""" +Benchmark Main Entry - Main entry point for benchmark using lm-evaluation-harness +""" + +import sys +import logging +from pathlib import Path + +from diffulex_bench.config import BenchmarkConfig, EngineConfig, EvalConfig +from diffulex.logger import setup_logger, get_logger +from diffulex_bench.arg_parser import create_argument_parser, get_default_config_path + +try: + from lm_eval.__main__ import cli_evaluate +except ImportError: + cli_evaluate = None + + +def config_to_model_args(config: BenchmarkConfig) -> str: + """ + Convert BenchmarkConfig to lm_eval model_args string format + + Args: + config: Benchmark configuration + + Returns: + Model arguments string in key=value format + """ + engine = config.engine + eval_config = config.eval + + args_dict = { + 'pretrained': engine.model_path, + 'model_name': engine.model_name, + 'decoding_strategy': engine.decoding_strategy, + 'mask_token_id': engine.mask_token_id, + 'tensor_parallel_size': engine.tensor_parallel_size, + 'data_parallel_size': engine.data_parallel_size, + 'gpu_memory_utilization': engine.gpu_memory_utilization, + 'max_model_len': engine.max_model_len, + 'max_num_batched_tokens': engine.max_num_batched_tokens, + 'max_num_seqs': engine.max_num_seqs, + 'temperature': eval_config.temperature, + 'max_new_tokens': eval_config.max_tokens, + 'use_lora': engine.use_lora, + 'enforce_eager': engine.enforce_eager, + 'kv_cache_layout': engine.kv_cache_layout, + 'accept_threshold': engine.accept_threshold, + 'complete_threshold': engine.complete_threshold, + 'add_new_block_threshold': engine.add_new_block_threshold, + 'diffusion_block_size': engine.diffusion_block_size, + 'wait_ready': True, + } + + if engine.tokenizer_path: + args_dict['tokenizer_path'] = engine.tokenizer_path + + if engine.use_lora and engine.lora_path: + args_dict['lora_path'] = engine.lora_path + + # Convert to string format: key1=value1,key2=value2 + args_list = [f"{k}={v}" for k, v in args_dict.items()] + return ','.join(args_list) + + +def dataset_name_to_tasks(dataset_name: str) -> str: + """ + Convert dataset name to lm_eval task name + + Args: + dataset_name: Dataset name (e.g., "gsm8k", "humaneval") + + Returns: + lm_eval task name + """ + mapping = { + 'gsm8k': 'gsm8k', + 'humaneval': 'humaneval', + } + return mapping.get(dataset_name, dataset_name) + + +def run_benchmark(config: BenchmarkConfig) -> None: + """ + Run benchmark using lm-evaluation-harness + + Args: + config: Benchmark configuration + """ + logger = get_logger(__name__) + + if cli_evaluate is None: + logger.error( + "lm-evaluation-harness is not installed. 
" + "Please install it with: pip install lm-eval" + ) + sys.exit(1) + + benchmark_info = [ + '=' * 80, + 'Diffulex Benchmark (using lm-evaluation-harness)', + '=' * 80, + f'Model: {config.engine.model_path}', + f'Model Name: {config.engine.model_name}', + f'Decoding Strategy: {config.engine.decoding_strategy}', + f'Tasks: {config.eval.dataset_name}', + f'Output Directory: {config.eval.output_dir}', + '=' * 80, + ] + logger.info('\n'.join(benchmark_info)) + + # Convert config to lm_eval arguments + model_args = config_to_model_args(config) + tasks = dataset_name_to_tasks(config.eval.dataset_name) + + # Prepare sys.argv for lm_eval + original_argv = sys.argv.copy() + + try: + sys.argv = [ + "lm_eval", + "--model", "diffulex", + "--model_args", model_args, + "--tasks", tasks, + "--batch_size", "1", + "--output_path", config.eval.output_dir, + ] + + if config.eval.dataset_limit: + sys.argv.extend(["--limit", str(config.eval.dataset_limit)]) + + # Add any additional lm_eval arguments from config if needed + # For now, we use default batch_size=1 + + lm_eval_info = [ + '=' * 80, + 'Starting lm-evaluation-harness evaluation...', + '=' * 80, + f'Model args: {model_args}', + f'Tasks: {tasks}', + '=' * 80, + ] + logger.info('\n'.join(lm_eval_info)) + + # Run lm_eval + cli_evaluate() + + logger.success("Evaluation completed successfully") + + except Exception as e: + logger.error(f"Evaluation failed: {e}", exc_info=True) + sys.exit(1) + finally: + # Restore original argv + sys.argv = original_argv + + +def load_config_from_args(args) -> BenchmarkConfig: + """ + Load configuration from command line arguments + + Args: + args: Parsed command line arguments + + Returns: + BenchmarkConfig instance + """ + logger = get_logger(__name__) + + # Try to load from config file + if args.config: + config_path = Path(args.config) + else: + # Try default config path + default_config = get_default_config_path() + if default_config.exists(): + config_path = default_config + logger.info(f"Using default config: {config_path}") + else: + config_path = None + + if config_path and config_path.exists(): + if config_path.suffix in ['.yaml', '.yml']: + config = BenchmarkConfig.from_yaml(str(config_path)) + elif config_path.suffix == '.json': + config = BenchmarkConfig.from_json(str(config_path)) + else: + logger.error(f"Unsupported config file format: {config_path.suffix}") + sys.exit(1) + logger.info(f"Loaded configuration from: {config_path}") + + # Override with command line arguments if provided + if args.model_path: + config.engine.model_path = args.model_path + if args.dataset: + config.eval.dataset_name = args.dataset + if args.dataset_limit is not None: + config.eval.dataset_limit = args.dataset_limit + if args.output_dir: + config.eval.output_dir = args.output_dir + else: + if not args.model_path: + logger.error("Either --config or --model-path must be provided") + sys.exit(1) + + # Create config from command line arguments + engine = EngineConfig( + model_path=args.model_path, + tokenizer_path=args.tokenizer_path, + model_name=args.model_name, + decoding_strategy=args.decoding_strategy, + mask_token_id=args.mask_token_id, + tensor_parallel_size=args.tensor_parallel_size, + data_parallel_size=args.data_parallel_size, + gpu_memory_utilization=args.gpu_memory_utilization, + max_model_len=args.max_model_len, + max_num_batched_tokens=getattr(args, 'max_num_batched_tokens', 4096), + max_num_seqs=getattr(args, 'max_num_seqs', 128), + use_lora=args.use_lora, + lora_path=args.lora_path, + enforce_eager=getattr(args, 
'enforce_eager', False), + kv_cache_layout=getattr(args, 'kv_cache_layout', 'unified'), + accept_threshold=args.accept_threshold, + complete_threshold=args.complete_threshold, + add_new_block_threshold=args.add_new_block_threshold, + diffusion_block_size=args.diffusion_block_size, + ) + + eval_config = EvalConfig( + dataset_name=args.dataset, + dataset_split=getattr(args, 'dataset_split', 'test'), + dataset_limit=args.dataset_limit, + temperature=args.temperature, + max_tokens=args.max_tokens, + ignore_eos=getattr(args, 'ignore_eos', False), + output_dir=args.output_dir, + save_results=args.save_results, + ) + + config = BenchmarkConfig(engine=engine, eval=eval_config) + + return config + + +def main(): + """Main function""" + parser = create_argument_parser() + args = parser.parse_args() + + # Setup logger + log_level = getattr(logging, args.log_level.upper()) + setup_logger("diffulex_bench", level=log_level, log_file=args.log_file) + + # Load configuration + config = load_config_from_args(args) + + # Run benchmark using lm_eval + run_benchmark(config) + + +if __name__ == "__main__": + main() diff --git a/diffulex_bench/metrics.py b/diffulex_bench/metrics.py new file mode 100644 index 0000000..88e5a49 --- /dev/null +++ b/diffulex_bench/metrics.py @@ -0,0 +1,126 @@ +""" +Benchmark Metrics - Evaluation metrics computation +""" + +import re +from typing import List, Dict, Any, Optional +import json + + +def extract_number(text: str) -> Optional[float]: + """ + Extract number from text (for GSM8K and other math problems) + + Args: + text: Input text + + Returns: + Extracted number, or None if not found + """ + # Try to match #### number format (GSM8K standard format) + pattern = r'####\s*(-?\d+(?:\.\d+)?)' + match = re.search(pattern, text) + if match: + return float(match.group(1)) + + # Try to match the last number + numbers = re.findall(r'-?\d+(?:\.\d+)?', text) + if numbers: + try: + return float(numbers[-1]) + except ValueError: + pass + + return None + + +def gsm8k_accuracy( + predictions: List[str], + ground_truths: List[str], +) -> float: + """ + Calculate GSM8K accuracy + + Args: + predictions: List of predicted texts + ground_truths: List of ground truth answers (including full solution process) + + Returns: + Accuracy (0-1) + """ + if len(predictions) != len(ground_truths): + raise ValueError("Predictions and ground_truths must have the same length") + + correct = 0 + for pred, gt in zip(predictions, ground_truths): + pred_num = extract_number(pred) + gt_num = extract_number(gt) + + if pred_num is not None and gt_num is not None: + if abs(pred_num - gt_num) < 1e-6: + correct += 1 + + return correct / len(predictions) if predictions else 0.0 + + +def humaneval_pass_at_k( + results: List[Dict[str, Any]], + k: int = 1, +) -> float: + """ + Calculate HumanEval Pass@k metric + + Args: + results: List of results, each should contain 'output', 'test', 'entry_point' fields + k: k value, default 1 + + Returns: + Pass@k score + """ + # Note: Full HumanEval evaluation requires code execution, this is just a framework + # In practice, need to integrate code execution environment (e.g., Docker) + # Returns None, actual evaluation requires implementing code execution logic + return None + + +def compute_metrics( + outputs: List[Dict[str, Any]], + ground_truths: Optional[List[str]] = None, + dataset_name: str = "gsm8k", +) -> Dict[str, Any]: + """ + Compute evaluation metrics + + Args: + outputs: List of generation results + ground_truths: List of ground truth answers (optional) + dataset_name: 
Dataset name, used to select appropriate evaluation method + + Returns: + Dictionary of metrics + """ + metrics = {} + + # Basic statistics + total_tokens = sum(len(o.get('token_ids', [])) for o in outputs) + avg_diff_steps = sum(o.get('n_diff_steps', 0) for o in outputs) / len(outputs) if outputs else 0 + total_time = sum(o.get('generation_time', 0) for o in outputs) + + metrics['num_samples'] = len(outputs) + metrics['total_tokens'] = total_tokens + metrics['avg_tokens_per_sample'] = total_tokens / len(outputs) if outputs else 0 + metrics['avg_diff_steps'] = avg_diff_steps + metrics['total_time'] = total_time + metrics['throughput_tok_s'] = total_tokens / total_time if total_time > 0 else 0 + + # Dataset-specific metrics + if ground_truths and dataset_name == "gsm8k": + predictions = [o.get('text', '') for o in outputs] + metrics['accuracy'] = gsm8k_accuracy(predictions, ground_truths) + elif ground_truths and dataset_name == "humaneval": + # HumanEval requires code execution, this is just a framework + metrics['pass_at_1'] = None # Need to implement code execution logic + metrics['note'] = "HumanEval evaluation requires code execution environment" + + return metrics + diff --git a/diffulex_bench/report.py b/diffulex_bench/report.py new file mode 100644 index 0000000..c4c7622 --- /dev/null +++ b/diffulex_bench/report.py @@ -0,0 +1,113 @@ +""" +Benchmark Report - Report generation for benchmark results +""" + +import json +from pathlib import Path +from typing import Dict, Any, List, Optional +import pandas as pd + + +def generate_report(results_file: str, output_file: Optional[str] = None) -> str: + """ + Generate benchmark report + + Args: + results_file: Path to results JSON file + output_file: Path to output report file, if None prints to console + + Returns: + Report text + """ + with open(results_file, 'r', encoding='utf-8') as f: + results = json.load(f) + + config = results['config'] + metrics = results['metrics'] + + # Generate report + report_lines = [] + append_line = lambda line: report_lines.append(line) + append_line("=" * 80) + append_line("Diffulex Benchmark Report") + append_line("=" * 80) + append_line("") + append_line("Configuration:") + append_line(f" Model: {config.get('model_path', 'N/A')}") + append_line(f" Model Name: {config.get('model_name', 'N/A')}") + append_line(f" Decoding Strategy: {config.get('decoding_strategy', 'N/A')}") + append_line(f" Dataset: {config.get('dataset_name', 'N/A')}") + append_line(f" Tensor Parallel Size: {config.get('tensor_parallel_size', 'N/A')}") + append_line(f" Data Parallel Size: {config.get('data_parallel_size', 'N/A')}") + append_line("") + append_line("Metrics:") + append_line(f" Number of Samples: {metrics.get('num_samples', 'N/A')}") + append_line(f" Total Tokens: {metrics.get('total_tokens', 'N/A')}") + append_line(f" Average Tokens per Sample: {metrics.get('avg_tokens_per_sample', 0):.2f}") + append_line(f" Average Diffusion Steps: {metrics.get('avg_diff_steps', 0):.2f}") + append_line(f" Total Time: {metrics.get('total_time', 0):.2f} seconds") + append_line(f" Throughput: {metrics.get('throughput_tok_s', 0):.2f} tokens/s") + + if 'accuracy' in metrics and metrics['accuracy'] is not None: + report_lines.append(f" Accuracy: {metrics['accuracy']:.4f}") + + report_lines.append("") + report_lines.append(f"Timestamp: {results.get('timestamp', 'N/A')}") + report_lines.append("=" * 80) + + report_text = "\n".join(report_lines) + + # Save or output + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + 
f.write(report_text) + print(f"Report saved to: {output_file}") + else: + print(report_text) + + return report_text + + +def compare_results(result_files: List[str], output_file: Optional[str] = None) -> pd.DataFrame: + """ + Compare multiple benchmark results + + Args: + result_files: List of result file paths + output_file: Path to output CSV file, if None only returns DataFrame + + Returns: + DataFrame with comparison results + """ + rows = [] + + for result_file in result_files: + with open(result_file, 'r', encoding='utf-8') as f: + results = json.load(f) + + config = results['config'] + metrics = results['metrics'] + + row = { + 'model_path': config.get('model_path', 'N/A'), + 'model_name': config.get('model_name', 'N/A'), + 'decoding_strategy': config.get('decoding_strategy', 'N/A'), + 'dataset': config.get('dataset_name', 'N/A'), + 'num_samples': metrics.get('num_samples', 0), + 'total_tokens': metrics.get('total_tokens', 0), + 'avg_tokens_per_sample': metrics.get('avg_tokens_per_sample', 0), + 'avg_diff_steps': metrics.get('avg_diff_steps', 0), + 'throughput_tok_s': metrics.get('throughput_tok_s', 0), + 'accuracy': metrics.get('accuracy', None), + 'timestamp': results.get('timestamp', 'N/A'), + } + rows.append(row) + + df = pd.DataFrame(rows) + + if output_file: + df.to_csv(output_file, index=False, encoding='utf-8') + print(f"Comparison saved to: {output_file}") + + return df + diff --git a/diffulex_bench/runner.py b/diffulex_bench/runner.py new file mode 100644 index 0000000..9617bc4 --- /dev/null +++ b/diffulex_bench/runner.py @@ -0,0 +1,193 @@ +""" +Benchmark Runner - Benchmark runner that wraps Diffulex inference engine +Provides a unified interface for benchmarking +""" + +import time +from typing import List, Dict, Any, Optional + +from diffulex import Diffulex, SamplingParams +from transformers import AutoTokenizer +from diffulex.logger import get_logger + + +class BenchmarkRunner: + """ + Benchmark runner that wraps the Diffulex inference engine + """ + + def __init__( + self, + model_path: str, + tokenizer_path: Optional[str] = None, + wait_ready: bool = True, + **diffulex_kwargs + ): + """ + Initialize the benchmark runner + + Args: + model_path: Path to the model + tokenizer_path: Path to the tokenizer, if None uses model_path + wait_ready: Whether to wait for engine to be fully initialized before returning + **diffulex_kwargs: Additional arguments to pass to Diffulex + """ + self.model_path = model_path + self.tokenizer_path = tokenizer_path or model_path + self.logger = get_logger(__name__) + + # Initialize Diffulex engine + self.logger.info("Initializing Diffulex engine...") + self.llm = Diffulex(model_path, **diffulex_kwargs) + + # Wait for engine to be ready if requested + if wait_ready: + self._wait_for_ready() + + # Load tokenizer + self.logger.info("Loading tokenizer...") + self.tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer_path, + trust_remote_code=True + ) + self.logger.success("Tokenizer loaded successfully") + + def _wait_for_ready(self, timeout: float = 300.0, check_interval: float = 0.5): + """ + Wait for the Diffulex engine to be fully initialized and ready + + Args: + timeout: Maximum time to wait in seconds + check_interval: Interval between readiness checks in seconds + """ + start_time = time.time() + + # Check if it's a DP worker (has _ask method) or TP worker + if hasattr(self.llm, '_ask'): + # DP worker: wait for all child processes to be ready + # by sending a lightweight command to each + dp_size = getattr(self.llm, 'dp_size', 1) + 
self.logger.info(f"[DiffulexDPWorker (DP={dp_size})]: Waiting for {dp_size} DiffulexTPWorker subprocesses to be ready...") + + while time.time() - start_time < timeout: + try: + # Try to send a lightweight command to check readiness + # Use is_finished as a lightweight check + for i in range(dp_size): + self.llm._ask(i, "is_finished") + self.logger.success("All DiffulexTPWorker subprocesses are ready") + return + except (EOFError, RuntimeError, AttributeError, ConnectionError) as e: + # Process not ready yet, wait and retry + elapsed = time.time() - start_time + if elapsed < timeout: + time.sleep(check_interval) + else: + raise RuntimeError( + f"Timeout waiting for DP workers to be ready after {elapsed:.1f}s: {e}" + ) from e + else: + # TP worker: wait for all subprocesses to be ready + # Check if subprocesses are alive and wait a bit for initialization + if hasattr(self.llm, 'ps') and self.llm.ps: + num_subprocesses = len(self.llm.ps) + self.logger.info(f"Waiting for {num_subprocesses} TP subprocess(es) to be ready...") + + while time.time() - start_time < timeout: + # Check if all subprocesses are alive + all_alive = all(p.is_alive() for p in self.llm.ps) + + if all_alive: + # Give subprocesses a bit more time to complete initialization + # The main process initialization is synchronous, but subprocesses + # may still be initializing (model loading, warmup, etc.) + # Subprocesses will synchronize via barrier in ModelRunnerBase.__init__ + # So we just need to wait a bit for them to complete initialization + time.sleep(2.0) # Wait a bit for subprocess initialization + self.logger.success("All TP subprocesses are ready") + return + else: + # Some process died, check which one + dead_processes = [ + i for i, p in enumerate(self.llm.ps) if not p.is_alive() + ] + exit_codes = [ + self.llm.ps[i].exitcode for i in dead_processes + ] + raise RuntimeError( + f"TP subprocess(es) {dead_processes} terminated during initialization. 
" + f"Exit code(s): {exit_codes}" + ) + + elapsed = time.time() - start_time + raise RuntimeError( + f"Timeout waiting for TP subprocesses to be ready after {elapsed:.1f}s" + ) + else: + # Single process TP worker, should be ready immediately + # Main process initialization is synchronous + self.logger.success("TP worker is ready") + return + + def generate( + self, + prompts: List[str], + sampling_params: SamplingParams, + use_tqdm: bool = True, + ) -> List[Dict[str, Any]]: + """ + Generate text + + Args: + prompts: List of input prompts + sampling_params: Sampling parameters + use_tqdm: Whether to show progress bar + + Returns: + List of generation results, each containing text, token_ids, n_diff_steps + """ + start_time = time.time() + outputs = self.llm.generate(prompts, sampling_params, use_tqdm=use_tqdm) + end_time = time.time() + + # Add timing information + total_time = end_time - start_time + for output in outputs: + output['generation_time'] = total_time / len(outputs) if outputs else 0 + + return outputs + + def evaluate_batch( + self, + prompts: List[str], + sampling_params: SamplingParams, + use_tqdm: bool = True, + ) -> Dict[str, Any]: + """ + Evaluate a batch of prompts + + Args: + prompts: List of input prompts + sampling_params: Sampling parameters + use_tqdm: Whether to show progress bar + + Returns: + Evaluation result dictionary containing generation results and statistics + """ + outputs = self.generate(prompts, sampling_params, use_tqdm=use_tqdm) + + # Calculate statistics + total_tokens = sum(len(o['token_ids']) for o in outputs) + total_time = sum(o.get('generation_time', 0) for o in outputs) + avg_diff_steps = sum(o.get('n_diff_steps', 0) for o in outputs) / len(outputs) if outputs else 0 + + return { + 'outputs': outputs, + 'num_samples': len(outputs), + 'total_tokens': total_tokens, + 'total_time': total_time, + 'avg_tokens_per_sample': total_tokens / len(outputs) if outputs else 0, + 'avg_diff_steps': avg_diff_steps, + 'throughput_tok_s': total_tokens / total_time if total_time > 0 else 0, + } + diff --git a/diffulex_profiler/README.md b/diffulex_profiler/README.md new file mode 100644 index 0000000..3fa25a7 --- /dev/null +++ b/diffulex_profiler/README.md @@ -0,0 +1,327 @@ +# Diffulex Profiler + +A modular profiling framework for performance analysis of the Diffulex inference engine. This module provides comprehensive performance metrics collection, multiple profiling backends, and flexible result export capabilities. + +## Features + +- **Multiple Profiling Backends**: Support for simple timing, VizTracer, and PyTorch Profiler +- **Comprehensive Metrics**: Collect timing, throughput, GPU utilization, memory usage, and custom metrics +- **Flexible Export**: Export results in JSON, CSV, or human-readable summary formats +- **Easy Integration**: Simple context manager API for seamless integration with existing code +- **Modular Design**: Extensible architecture for adding custom backends and exporters + +## Installation + +The profiler is included as part of the Diffulex package. No additional installation is required beyond the standard Diffulex dependencies. 
+ +Optional dependencies for advanced features: +- `viztracer`: For detailed function call tracing (already in dependencies) +- `pynvml`: For detailed GPU utilization metrics (optional) + +## Quick Start + +### Basic Usage + +```python +from diffulex_profiler import DiffulexProfiler, ProfilerConfig +from diffulex import Diffulex, SamplingParams + +# Initialize profiler +profiler = DiffulexProfiler( + config=ProfilerConfig( + enabled=True, + backend="simple", + output_dir="log/profiles" + ) +) + +# Initialize Diffulex engine +llm = Diffulex(model_path, model_name="dream", ...) + +# Profile inference +with profiler.profile("inference", metadata={"batch_size": 10}): + outputs = llm.generate(prompts, sampling_params) + total_tokens = sum(len(o['token_ids']) for o in outputs) + profiler.record_throughput(total_tokens) + +# Export results +profiler.export("log/profiles/inference_profile.json") +``` + +### Advanced Usage with Multiple Sections + +```python +profiler = DiffulexProfiler( + config=ProfilerConfig( + enabled=True, + backend="simple", + collect_gpu_metrics=True, + collect_memory_metrics=True, + export_formats=["json", "csv", "summary"] + ) +) + +# Profile different sections +with profiler.profile("model_loading"): + llm = Diffulex(model_path, ...) + +with profiler.profile("prefill", metadata={"num_prompts": len(prompts)}): + # Prefill phase + pass + +with profiler.profile("decode"): + outputs = llm.generate(prompts, sampling_params) + profiler.record_throughput(sum(len(o['token_ids']) for o in outputs)) + +# Get summary +summary = profiler.get_summary() +print(f"Total duration: {summary['total_duration_sec']:.2f}s") +print(f"Average throughput: {summary['avg_throughput_tokens_per_sec']:.2f} tok/s") + +# Export all results +profiler.export() +``` + +## Configuration + +### ProfilerConfig + +The `ProfilerConfig` class provides comprehensive configuration options: + +```python +@dataclass +class ProfilerConfig: + enabled: bool = True # Enable/disable profiling + backend: str = "simple" # Backend: "simple", "viztracer", "pytorch" + output_dir: str = "log/profiles" # Output directory for results + output_file: Optional[str] = None # Optional custom output filename + collect_gpu_metrics: bool = True # Collect GPU metrics + collect_memory_metrics: bool = True # Collect memory metrics + collect_timing: bool = True # Collect timing information + export_formats: List[str] = ["json", "summary"] # Export formats + viztracer_config: Optional[Dict] = None # VizTracer-specific config + pytorch_profiler_config: Optional[Dict] = None # PyTorch Profiler config +``` + +## Profiling Backends + +### Simple Timer Backend (Default) + +The simplest backend that only tracks execution time. No additional dependencies required. + +```python +profiler = DiffulexProfiler( + config=ProfilerConfig(backend="simple") +) +``` + +### VizTracer Backend + +For detailed function call tracing and visualization. Requires `viztracer` package. + +```python +profiler = DiffulexProfiler( + config=ProfilerConfig( + backend="viztracer", + viztracer_config={ + "output_file": "trace.json", + "file_info": True, + } + ) +) +``` + +### PyTorch Profiler Backend + +For GPU/CPU operation-level profiling. Built into PyTorch. 
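The snippet below assumes `ProfilerActivity` has already been imported from PyTorch:

```python
from torch.profiler import ProfilerActivity
```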
+ +```python +profiler = DiffulexProfiler( + config=ProfilerConfig( + backend="pytorch", + pytorch_profiler_config={ + "activities": [ProfilerActivity.CPU, ProfilerActivity.CUDA], + "record_shapes": True, + "profile_memory": True, + } + ) +) +``` + +## Metrics Collection + +The profiler automatically collects: + +- **Timing**: Start time, end time, duration +- **Throughput**: Tokens per second (when recorded via `record_throughput()`) +- **GPU Metrics**: Utilization, memory usage, device information +- **Memory Metrics**: System memory usage and deltas +- **Custom Metrics**: User-defined metrics via `record_metric()` + +### Recording Custom Metrics + +```python +with profiler.profile("custom_section"): + # Your code here + profiler.record_metric("num_sequences", 10) + profiler.record_metric("avg_length", 128.5) + profiler.record_throughput(total_tokens=1000) +``` + +## Export Formats + +### JSON Export + +Structured JSON format suitable for programmatic analysis: + +```python +profiler = DiffulexProfiler( + config=ProfilerConfig(export_formats=["json"]) +) +profiler.export("results.json") +``` + +### CSV Export + +Tabular format for spreadsheet analysis: + +```python +profiler = DiffulexProfiler( + config=ProfilerConfig(export_formats=["csv"]) +) +profiler.export("results.csv") +``` + +### Summary Export + +Human-readable text summary: + +```python +profiler = DiffulexProfiler( + config=ProfilerConfig(export_formats=["summary"]) +) +profiler.export("results.txt") +``` + +## Integration Examples + +### Integration with Diffulex Engine + +```python +from diffulex_profiler import DiffulexProfiler, ProfilerConfig +from diffulex import Diffulex, SamplingParams + +# Setup +profiler = DiffulexProfiler(ProfilerConfig(enabled=True)) +llm = Diffulex(model_path, model_name="dream", ...) +sampling_params = SamplingParams(temperature=0.0, max_tokens=256) + +# Profile generation +prompts = ["What is 2+2?", "Explain quantum computing"] +with profiler.profile("generate", metadata={"num_prompts": len(prompts)}): + outputs = llm.generate(prompts, sampling_params) + total_tokens = sum(len(o['token_ids']) for o in outputs) + profiler.record_throughput(total_tokens) + profiler.record_metric("num_outputs", len(outputs)) + profiler.record_metric("avg_diff_steps", + sum(o['n_diff_steps'] for o in outputs) / len(outputs)) + +# Export +profiler.export("generation_profile.json") +summary = profiler.get_summary() +print(f"Throughput: {summary['avg_throughput_tokens_per_sec']:.2f} tok/s") +``` + +### Batch Profiling + +```python +profiler = DiffulexProfiler(ProfilerConfig(enabled=True)) + +for batch_idx, batch in enumerate(batches): + with profiler.profile(f"batch_{batch_idx}", metadata={"batch_size": len(batch)}): + outputs = llm.generate(batch, sampling_params) + profiler.record_throughput(sum(len(o['token_ids']) for o in outputs)) + +profiler.export("batch_profiles.json") +``` + +## API Reference + +### DiffulexProfiler + +Main profiler class. 
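Besides the `profile()` context manager used throughout Quick Start, the class exposes an explicit `start()`/`stop()` pair (see the method list below), which is convenient when the measured region does not fit cleanly into a single `with` block. A minimal sketch:

```python
from diffulex_profiler import DiffulexProfiler, ProfilerConfig

profiler = DiffulexProfiler(config=ProfilerConfig(enabled=True, backend="simple"))

profiler.start("decode_step", metadata={"step": 0})
# ... run the code you want to measure ...
profiler.record_metric("num_sequences", 8)
profiler.stop()

profiler.export()  # writes to output_dir in the configured export formats
```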
+ +#### Methods + +- `profile(name: str, metadata: Optional[Dict] = None)`: Context manager for profiling +- `start(name: str, metadata: Optional[Dict] = None)`: Start profiling a section +- `stop()`: Stop profiling current section +- `record_metric(name: str, value: Any)`: Record a custom metric +- `record_throughput(tokens: int, duration: Optional[float] = None)`: Record throughput +- `export(output_path: Optional[str] = None)`: Export results +- `get_summary() -> Dict[str, Any]`: Get summary statistics +- `clear()`: Clear all collected metrics + +### PerformanceMetrics + +Container for performance metrics. + +#### Attributes + +- `name`: Section name +- `duration`: Duration in seconds +- `total_tokens`: Total tokens processed +- `throughput_tokens_per_sec`: Throughput in tokens/second +- `gpu_utilization`: GPU utilization percentage +- `memory_delta_mb`: Memory usage delta in MB +- `custom_metrics`: Dictionary of custom metrics +- `metadata`: User-provided metadata + +## Best Practices + +1. **Use Context Managers**: Always use the `profile()` context manager for automatic cleanup +2. **Record Throughput**: Call `record_throughput()` after inference to get accurate throughput metrics +3. **Add Metadata**: Include relevant metadata (batch size, model config, etc.) for better analysis +4. **Choose Appropriate Backend**: Use "simple" for basic timing, "viztracer" for detailed tracing, "pytorch" for GPU profiling +5. **Export Regularly**: Export results periodically for long-running experiments +6. **Clear When Needed**: Use `clear()` to reset metrics between different profiling sessions + +## Troubleshooting + +### Profiler Not Collecting Metrics + +- Ensure `enabled=True` in `ProfilerConfig` +- Check that you're using the context manager correctly +- Verify that `start()` and `stop()` are called in pairs + +### GPU Metrics Not Available + +- Ensure CUDA is available: `torch.cuda.is_available()` +- Install `pynvml` for detailed GPU utilization: `pip install pynvml` + +### Backend Import Errors + +- Simple backend is always available +- VizTracer backend requires: `pip install viztracer` +- PyTorch Profiler is built into PyTorch + +## Contributing + +To add a new profiling backend: + +1. Create a new class inheriting from `ProfilerBackend` +2. Implement `start()` and `stop()` methods +3. Add it to `backends/__init__.py` +4. Update `DiffulexProfiler._init_backend()` to support it + +To add a new exporter: + +1. Create a new class inheriting from `ProfilerExporter` +2. Implement `export()` method +3. Add it to `exporters/__init__.py` +4. Update `DiffulexProfiler._init_exporters()` to support it + +## License + +Same as the main Diffulex project. 
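For reference, a minimal custom backend following the Contributing steps above might look like the sketch below. The class name and its behavior are illustrative only; the sketch assumes the `ProfilerBackend` interface defined in `diffulex_profiler/backends/base.py`.

```python
import time
from typing import Any, Dict, Optional

from diffulex_profiler.backends.base import ProfilerBackend


class WallClockBackend(ProfilerBackend):
    """Hypothetical example backend that records raw wall-clock timestamps."""

    def __init__(self) -> None:
        self.current_name: Optional[str] = None
        self.start_time: Optional[float] = None

    def start(self, name: str) -> None:
        # Remember the section name and when it started.
        self.current_name = name
        self.start_time = time.time()

    def stop(self) -> Optional[Dict[str, Any]]:
        # Return a small dict of collected data, mirroring the built-in backends.
        if self.start_time is None:
            return None
        result = {
            "backend": "wallclock",
            "name": self.current_name,
            "start_time": self.start_time,
            "end_time": time.time(),
        }
        self.current_name = None
        self.start_time = None
        return result

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.start_time is not None:
            self.stop()
```

To make such a backend selectable via `ProfilerConfig(backend=...)`, it would still need to be exported from `backends/__init__.py` and handled in `DiffulexProfiler._init_backend()`, as listed in the Contributing steps.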
+ diff --git a/diffulex_profiler/__init__.py b/diffulex_profiler/__init__.py new file mode 100644 index 0000000..67c812a --- /dev/null +++ b/diffulex_profiler/__init__.py @@ -0,0 +1,41 @@ +""" +Diffulex Profiler - Modular profiling framework for performance analysis of Diffulex inference engine +""" + +from diffulex_profiler.profiler import DiffulexProfiler, ProfilerConfig +from diffulex_profiler.metrics import ( + PerformanceMetrics, + collect_gpu_metrics, + collect_cpu_metrics, + collect_memory_metrics, +) +from diffulex_profiler.backends import ( + ProfilerBackend, + SimpleTimerBackend, + VizTracerBackend, + PyTorchProfilerBackend, +) +from diffulex_profiler.exporters import ( + ProfilerExporter, + JSONExporter, + CSVExporter, + SummaryExporter, +) + +__all__ = [ + "DiffulexProfiler", + "ProfilerConfig", + "PerformanceMetrics", + "collect_gpu_metrics", + "collect_cpu_metrics", + "collect_memory_metrics", + "ProfilerBackend", + "SimpleTimerBackend", + "VizTracerBackend", + "PyTorchProfilerBackend", + "ProfilerExporter", + "JSONExporter", + "CSVExporter", + "SummaryExporter", +] + diff --git a/diffulex_profiler/backends/__init__.py b/diffulex_profiler/backends/__init__.py new file mode 100644 index 0000000..65bdb2c --- /dev/null +++ b/diffulex_profiler/backends/__init__.py @@ -0,0 +1,24 @@ +""" +Profiling backends for different profiling tools. +""" +from diffulex_profiler.backends.base import ProfilerBackend +from diffulex_profiler.backends.simple import SimpleTimerBackend + +__all__ = [ + "ProfilerBackend", + "SimpleTimerBackend", +] + +# Optional backends +try: + from diffulex_profiler.backends.viztracer import VizTracerBackend + __all__.append("VizTracerBackend") +except ImportError: + pass + +try: + from diffulex_profiler.backends.pytorch import PyTorchProfilerBackend + __all__.append("PyTorchProfilerBackend") +except ImportError: + pass + diff --git a/diffulex_profiler/backends/base.py b/diffulex_profiler/backends/base.py new file mode 100644 index 0000000..ed77513 --- /dev/null +++ b/diffulex_profiler/backends/base.py @@ -0,0 +1,30 @@ +""" +Base class for profiling backends. +""" +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any + + +class ProfilerBackend(ABC): + """Abstract base class for profiling backends.""" + + @abstractmethod + def start(self, name: str) -> None: + """Start profiling a section.""" + pass + + @abstractmethod + def stop(self) -> Optional[Dict[str, Any]]: + """Stop profiling and return collected data.""" + pass + + @abstractmethod + def __enter__(self): + """Context manager entry.""" + pass + + @abstractmethod + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + pass + diff --git a/diffulex_profiler/backends/pytorch.py b/diffulex_profiler/backends/pytorch.py new file mode 100644 index 0000000..4f5e068 --- /dev/null +++ b/diffulex_profiler/backends/pytorch.py @@ -0,0 +1,100 @@ +""" +PyTorch Profiler backend. 
+""" +from typing import Optional, Dict, Any +from pathlib import Path + +try: + import torch + from torch.profiler import profile, record_function, ProfilerActivity + PYTORCH_PROFILER_AVAILABLE = True +except ImportError: + PYTORCH_PROFILER_AVAILABLE = False + profile = None + record_function = None + ProfilerActivity = None + +from diffulex_profiler.backends.base import ProfilerBackend +from diffulex.logger import get_logger + +logger = get_logger(__name__) + + +class PyTorchProfilerBackend(ProfilerBackend): + """PyTorch Profiler-based backend for GPU/CPU operation profiling.""" + + def __init__(self, output_dir: Optional[str] = None, activities: Optional[list] = None, **kwargs): + if not PYTORCH_PROFILER_AVAILABLE: + raise ImportError("PyTorch Profiler is not available") + + self.output_dir = Path(output_dir) if output_dir else Path("log/profiles") + self.output_dir.mkdir(parents=True, exist_ok=True) + + if activities is None: + activities = [ProfilerActivity.CPU] + if torch.cuda.is_available(): + activities.append(ProfilerActivity.CUDA) + + self.activities = activities + self.config = kwargs + self.profiler: Optional[profile] = None + self.current_name: Optional[str] = None + + def start(self, name: str) -> None: + """Start PyTorch Profiler.""" + if self.profiler is not None: + logger.warning("PyTorch Profiler already started, stopping previous instance") + self.stop() + + self.current_name = name + self.profiler = profile( + activities=self.activities, + record_shapes=True, + profile_memory=True, + with_stack=True, + **self.config + ) + self.profiler.__enter__() + + def stop(self) -> Optional[Dict[str, Any]]: + """Stop PyTorch Profiler and export trace.""" + if self.profiler is None: + return None + + self.profiler.__exit__(None, None, None) + + trace_file = self.output_dir / f"pytorch_trace_{self.current_name}.json" + try: + self.profiler.export_chrome_trace(str(trace_file)) + except Exception as e: + logger.warning(f"Failed to export PyTorch trace: {e}") + trace_file = None + + result = { + "backend": "pytorch", + "trace_file": str(trace_file) if trace_file else None, + "name": self.current_name, + } + + try: + events = self.profiler.key_averages() + result["summary"] = { + "total_events": len(events), + "cpu_time_total_ms": sum(e.cpu_time_total_us for e in events) / 1000, + "cuda_time_total_ms": sum(e.cuda_time_total_us for e in events) / 1000 if torch.cuda.is_available() else 0, + } + except Exception as e: + logger.warning(f"Failed to get profiler summary: {e}") + + self.profiler = None + self.current_name = None + + return result + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.profiler is not None: + self.stop() + diff --git a/diffulex_profiler/backends/simple.py b/diffulex_profiler/backends/simple.py new file mode 100644 index 0000000..c4128f2 --- /dev/null +++ b/diffulex_profiler/backends/simple.py @@ -0,0 +1,44 @@ +""" +Simple timer-based profiling backend. 
+""" +import time +from typing import Optional, Dict, Any + +from diffulex_profiler.backends.base import ProfilerBackend + + +class SimpleTimerBackend(ProfilerBackend): + """Simple timer-based profiling backend that only tracks time.""" + + def __init__(self): + self.start_time: Optional[float] = None + self.current_name: Optional[str] = None + + def start(self, name: str) -> None: + """Start timing.""" + self.current_name = name + self.start_time = time.perf_counter() + + def stop(self) -> Optional[Dict[str, Any]]: + """Stop timing and return duration.""" + if self.start_time is None: + return None + + duration = time.perf_counter() - self.start_time + result = { + "duration_sec": duration, + "name": self.current_name, + } + + self.start_time = None + self.current_name = None + + return result + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.start_time is not None: + self.stop() + diff --git a/diffulex_profiler/backends/viztracer.py b/diffulex_profiler/backends/viztracer.py new file mode 100644 index 0000000..22cf38e --- /dev/null +++ b/diffulex_profiler/backends/viztracer.py @@ -0,0 +1,75 @@ +""" +VizTracer profiling backend. +""" +from typing import Optional, Dict, Any +from pathlib import Path + +try: + from viztracer import VizTracer + VIZTRACER_AVAILABLE = True +except ImportError: + VIZTRACER_AVAILABLE = False + VizTracer = None + +from diffulex_profiler.backends.base import ProfilerBackend +from diffulex.logger import get_logger + +logger = get_logger(__name__) + + +class VizTracerBackend(ProfilerBackend): + """VizTracer-based profiling backend for detailed function call tracing.""" + + def __init__(self, output_file: Optional[str] = None, output_dir: Optional[str] = None, **kwargs): + if not VIZTRACER_AVAILABLE: + raise ImportError("VizTracer is not installed. Install it with: pip install viztracer") + + self.output_file = output_file + self.output_dir = output_dir + self.tracer: Optional[VizTracer] = None + self.config = kwargs + + def start(self, name: str) -> None: + """Start VizTracer.""" + if self.tracer is not None: + logger.warning("VizTracer already started, stopping previous instance") + self.stop() + + if self.output_file: + output_file = self.output_file + else: + output_file = f"viztracer_{name}.json" + + # If output_dir is specified, prepend it to the output_file path + if self.output_dir: + output_file = str(Path(self.output_dir) / Path(output_file).name) + # Ensure output directory exists + Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + logger.info(f"VizTracer output file: {output_file}") + self.tracer = VizTracer(output_file=output_file, **self.config) + self.tracer.start() + + def stop(self) -> Optional[Dict[str, Any]]: + """Stop VizTracer and return trace file path.""" + if self.tracer is None: + return None + + self.tracer.stop() + output_file = self.tracer.output_file + + result = { + "backend": "viztracer", + "output_file": str(output_file), + } + + self.tracer = None + return result + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.tracer is not None: + self.stop() + diff --git a/diffulex_profiler/example.py b/diffulex_profiler/example.py new file mode 100644 index 0000000..8982990 --- /dev/null +++ b/diffulex_profiler/example.py @@ -0,0 +1,132 @@ +""" +Example usage of Diffulex Profiler. + +This example demonstrates how to use the profiler to collect performance metrics +during Diffulex inference. 
+""" +from diffulex_profiler import DiffulexProfiler, ProfilerConfig +from diffulex import Diffulex, SamplingParams + + +def example_basic_usage(): + """Basic profiling example.""" + # Initialize profiler + profiler = DiffulexProfiler( + config=ProfilerConfig( + enabled=True, + backend="simple", + output_dir="log/profiles", + collect_gpu_metrics=True, + collect_memory_metrics=True, + ) + ) + + # Initialize Diffulex engine + model_path = "/path/to/your/model" + llm = Diffulex( + model_path, + model_name="dream", + tensor_parallel_size=1, + data_parallel_size=1, + gpu_memory_utilization=0.25, + max_model_len=2048, + decoding_strategy="d2f", + ) + + # Prepare prompts + prompts = ["What is 2+2?", "Explain quantum computing"] + sampling_params = SamplingParams(temperature=0.0, max_tokens=256) + + # Profile inference + with profiler.profile("inference", metadata={"num_prompts": len(prompts)}): + outputs = llm.generate(prompts, sampling_params) + total_tokens = sum(len(o['token_ids']) for o in outputs) + profiler.record_throughput(total_tokens) + profiler.record_metric("num_outputs", len(outputs)) + profiler.record_metric("avg_diff_steps", + sum(o['n_diff_steps'] for o in outputs) / len(outputs)) + + # Export results + profiler.export("inference_profile.json") + + # Get summary + summary = profiler.get_summary() + print(f"Total duration: {summary['total_duration_sec']:.2f}s") + print(f"Average throughput: {summary['avg_throughput_tokens_per_sec']:.2f} tok/s") + + +def example_multiple_sections(): + """Example with multiple profiling sections.""" + profiler = DiffulexProfiler( + config=ProfilerConfig( + enabled=True, + backend="simple", + export_formats=["json", "csv", "summary"] + ) + ) + + # Profile model loading + with profiler.profile("model_loading"): + llm = Diffulex(model_path, model_name="dream", ...) + + # Profile prefill + prompts = ["Prompt 1", "Prompt 2"] + with profiler.profile("prefill", metadata={"num_prompts": len(prompts)}): + # Prefill operations + pass + + # Profile decode + with profiler.profile("decode"): + outputs = llm.generate(prompts, SamplingParams()) + profiler.record_throughput(sum(len(o['token_ids']) for o in outputs)) + + # Export all results + profiler.export("multi_section_profile.json") + + +def example_viztracer_backend(): + """Example using VizTracer backend for detailed tracing.""" + profiler = DiffulexProfiler( + config=ProfilerConfig( + enabled=True, + backend="viztracer", + viztracer_config={ + "output_file": "trace.json", + "file_info": True, + } + ) + ) + + with profiler.profile("detailed_trace"): + # Your code here + pass + + profiler.export() + + +def example_pytorch_profiler(): + """Example using PyTorch Profiler for GPU/CPU profiling.""" + from torch.profiler import ProfilerActivity + + profiler = DiffulexProfiler( + config=ProfilerConfig( + enabled=True, + backend="pytorch", + pytorch_profiler_config={ + "activities": [ProfilerActivity.CPU, ProfilerActivity.CUDA], + "record_shapes": True, + "profile_memory": True, + } + ) + ) + + with profiler.profile("gpu_profiling"): + # Your code here + pass + + profiler.export() + + +if __name__ == "__main__": + example_basic_usage() + diff --git a/diffulex_profiler/exporters/__init__.py b/diffulex_profiler/exporters/__init__.py new file mode 100644 index 0000000..a0019f4 --- /dev/null +++ b/diffulex_profiler/exporters/__init__.py @@ -0,0 +1,19 @@ +""" +Exporters for profiling results. 
+""" +from diffulex_profiler.exporters.base import ProfilerExporter +from diffulex_profiler.exporters.json import JSONExporter +from diffulex_profiler.exporters.summary import SummaryExporter + +__all__ = [ + "ProfilerExporter", + "JSONExporter", + "SummaryExporter", +] + +try: + from diffulex_profiler.exporters.csv import CSVExporter + __all__.append("CSVExporter") +except ImportError: + pass + diff --git a/diffulex_profiler/exporters/base.py b/diffulex_profiler/exporters/base.py new file mode 100644 index 0000000..07badad --- /dev/null +++ b/diffulex_profiler/exporters/base.py @@ -0,0 +1,24 @@ +""" +Base class for profiler exporters. +""" +from abc import ABC, abstractmethod +from pathlib import Path +from typing import List + +from diffulex_profiler.metrics import PerformanceMetrics + + +class ProfilerExporter(ABC): + """Abstract base class for exporting profiling results.""" + + @abstractmethod + def export(self, metrics: List[PerformanceMetrics], output_path: Path) -> None: + """ + Export metrics to a file. + + Args: + metrics: List of performance metrics to export + output_path: Base path for output (exporter may add extension) + """ + pass + diff --git a/diffulex_profiler/exporters/csv.py b/diffulex_profiler/exporters/csv.py new file mode 100644 index 0000000..ee26767 --- /dev/null +++ b/diffulex_profiler/exporters/csv.py @@ -0,0 +1,46 @@ +""" +CSV exporter for profiling results. +""" +import csv +from pathlib import Path +from typing import List + +from diffulex_profiler.exporters.base import ProfilerExporter +from diffulex_profiler.metrics import PerformanceMetrics + + +class CSVExporter(ProfilerExporter): + """Export profiling results to CSV format.""" + + def export(self, metrics: List[PerformanceMetrics], output_path: Path) -> None: + """Export metrics to CSV file.""" + output_file = output_path.with_suffix(".csv") + + if not metrics: + return + + fieldnames = set(["name", "duration_sec", "total_tokens", "throughput_tokens_per_sec"]) + + for m in metrics: + fieldnames.update(m.custom_metrics.keys()) + if m.metadata: + fieldnames.update(f"metadata_{k}" for k in m.metadata.keys()) + + fieldnames = sorted(list(fieldnames)) + + with open(output_file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + + for m in metrics: + row = { + "name": m.name, + "duration_sec": m.duration, + "total_tokens": m.total_tokens, + "throughput_tokens_per_sec": m.throughput_tokens_per_sec, + } + row.update(m.custom_metrics) + for k, v in m.metadata.items(): + row[f"metadata_{k}"] = v + writer.writerow(row) + diff --git a/diffulex_profiler/exporters/json.py b/diffulex_profiler/exporters/json.py new file mode 100644 index 0000000..19fc641 --- /dev/null +++ b/diffulex_profiler/exporters/json.py @@ -0,0 +1,43 @@ +""" +JSON exporter for profiling results. 
+""" +import json +from pathlib import Path +from typing import List + +from diffulex_profiler.exporters.base import ProfilerExporter +from diffulex_profiler.metrics import PerformanceMetrics + + +class JSONExporter(ProfilerExporter): + """Export profiling results to JSON format.""" + + def export(self, metrics: List[PerformanceMetrics], output_path: Path) -> None: + """Export metrics to JSON file.""" + output_file = output_path.with_suffix(".json") + + data = { + "metrics": [m.to_dict() for m in metrics], + "summary": self._compute_summary(metrics), + } + + with open(output_file, "w") as f: + json.dump(data, f, indent=2) + + def _compute_summary(self, metrics: List[PerformanceMetrics]) -> dict: + """Compute summary statistics.""" + if not metrics: + return {} + + total_duration = sum(m.duration for m in metrics if m.duration) + total_tokens = sum(m.total_tokens for m in metrics if m.total_tokens) + + return { + "total_sections": len(metrics), + "total_duration_sec": total_duration, + "total_tokens": total_tokens, + "avg_throughput_tokens_per_sec": ( + total_tokens / total_duration if total_duration > 0 else 0 + ), + } + diff --git a/diffulex_profiler/exporters/summary.py b/diffulex_profiler/exporters/summary.py new file mode 100644 index 0000000..2b44d4e --- /dev/null +++ b/diffulex_profiler/exporters/summary.py @@ -0,0 +1,68 @@ +""" +Summary exporter for profiling results (human-readable text output). +""" +from pathlib import Path +from typing import List + +from diffulex_profiler.exporters.base import ProfilerExporter +from diffulex_profiler.metrics import PerformanceMetrics +from diffulex.logger import get_logger + +logger = get_logger(__name__) + + +class SummaryExporter(ProfilerExporter): + """Export profiling results as a human-readable summary.""" + + def export(self, metrics: List[PerformanceMetrics], output_path: Path) -> None: + """Export metrics as a text summary.""" + output_file = output_path.with_suffix(".txt") + + summary_lines = [] + summary_lines.append("=" * 80) + summary_lines.append("Diffulex Profiling Summary") + summary_lines.append("=" * 80) + summary_lines.append("") + + total_duration = sum(m.duration for m in metrics if m.duration) + total_tokens = sum(m.total_tokens for m in metrics if m.total_tokens) + avg_throughput = ( + total_tokens / total_duration if total_duration > 0 and total_tokens > 0 else 0 + ) + + summary_lines.append(f"Total Sections: {len(metrics)}") + summary_lines.append(f"Total Duration: {total_duration:.2f} seconds") + summary_lines.append(f"Total Tokens: {total_tokens}") + summary_lines.append(f"Average Throughput: {avg_throughput:.2f} tokens/sec") + summary_lines.append("") + + summary_lines.append("-" * 80) + summary_lines.append("Section Details:") + summary_lines.append("-" * 80) + + for m in metrics: + summary_lines.append(f"\nSection: {m.name}") + summary_lines.append(f" Duration: {m.duration:.4f} seconds") + if m.total_tokens > 0: + summary_lines.append(f" Tokens: {m.total_tokens}") + summary_lines.append(f" Throughput: {m.throughput_tokens_per_sec:.2f} tokens/sec") + if m.gpu_utilization != 0: + summary_lines.append(f" GPU Utilization: {m.gpu_utilization:.2f}%") + if m.memory_delta_mb != 0: + summary_lines.append(f" Memory Delta: {m.memory_delta_mb:.2f} MB") + if m.custom_metrics: + summary_lines.append(f" Custom Metrics: {m.custom_metrics}") + if m.metadata: + summary_lines.append(f" Metadata: {m.metadata}") + if m.backend_data and m.backend_data.get("backend") == "viztracer": + output_file = m.backend_data.get("output_file", "N/A") 
+ summary_lines.append(f" VizTracer Output: {output_file}") + + summary_lines.append("") + summary_lines.append("=" * 80) + + with open(output_file, "w") as f: + f.write("\n".join(summary_lines)) + + logger.info("\n".join(summary_lines)) + diff --git a/diffulex_profiler/metrics.py b/diffulex_profiler/metrics.py new file mode 100644 index 0000000..f3678ed --- /dev/null +++ b/diffulex_profiler/metrics.py @@ -0,0 +1,113 @@ +""" +Performance metrics collection and data structures. +""" +from __future__ import annotations + +import time +from dataclasses import dataclass, field +from typing import Dict, Any, Optional + +import torch + +try: + import psutil + PSUTIL_AVAILABLE = True +except ImportError: + PSUTIL_AVAILABLE = False + + +@dataclass +class PerformanceMetrics: + """Container for performance metrics collected during profiling.""" + name: str + metadata: Dict[str, Any] = field(default_factory=dict) + start_time: float = 0.0 + end_time: float = 0.0 + duration: float = 0.0 + total_tokens: int = 0 + throughput_tokens_per_sec: float = 0.0 + gpu_metrics_start: Optional[Dict[str, Any]] = None + gpu_metrics_end: Optional[Dict[str, Any]] = None + gpu_utilization: float = 0.0 + memory_metrics_start: Optional[Dict[str, Any]] = None + memory_metrics_end: Optional[Dict[str, Any]] = None + memory_delta_mb: float = 0.0 + custom_metrics: Dict[str, Any] = field(default_factory=dict) + backend_data: Optional[Dict[str, Any]] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert metrics to dictionary for serialization.""" + return { + "name": self.name, + "metadata": self.metadata, + "duration_sec": self.duration, + "total_tokens": self.total_tokens, + "throughput_tokens_per_sec": self.throughput_tokens_per_sec, + "gpu_utilization": self.gpu_utilization, + "memory_delta_mb": self.memory_delta_mb, + "custom_metrics": self.custom_metrics, + "backend_data": self.backend_data, + } + + +def collect_gpu_metrics() -> Dict[str, Any]: + """Collect current GPU metrics.""" + if not torch.cuda.is_available(): + return {} + + metrics = {} + try: + device = torch.cuda.current_device() + metrics["device"] = device + metrics["device_name"] = torch.cuda.get_device_name(device) + + memory_stats = torch.cuda.memory_stats(device) + metrics["allocated_mb"] = memory_stats.get("allocated_bytes.all.current", 0) / (1024 ** 2) + metrics["reserved_mb"] = memory_stats.get("reserved_bytes.all.current", 0) / (1024 ** 2) + metrics["peak_allocated_mb"] = memory_stats.get("allocated_bytes.all.peak", 0) / (1024 ** 2) + + try: + import pynvml + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + util = pynvml.nvmlDeviceGetUtilizationRates(handle) + metrics["utilization"] = util.gpu + metrics["memory_utilization"] = util.memory + except (ImportError, Exception): + pass + + except Exception: + pass + + return metrics + + +def collect_cpu_metrics() -> Dict[str, Any]: + """Collect current CPU metrics.""" + if not PSUTIL_AVAILABLE: + return {} + try: + return { + "cpu_percent": psutil.cpu_percent(interval=0.1), + "cpu_count": psutil.cpu_count(), + "load_avg": psutil.getloadavg() if hasattr(psutil, "getloadavg") else None, + } + except Exception: + return {} + + +def collect_memory_metrics() -> Dict[str, Any]: + """Collect current memory metrics.""" + if not PSUTIL_AVAILABLE: + return {} + try: + mem = psutil.virtual_memory() + return { + "total_mb": mem.total / (1024 ** 2), + "available_mb": mem.available / (1024 ** 2), + "used_mb": mem.used / (1024 ** 2), + "percent": mem.percent, + } + except Exception: + return 
{} + diff --git a/diffulex_profiler/profiler.py b/diffulex_profiler/profiler.py new file mode 100644 index 0000000..8f3f20d --- /dev/null +++ b/diffulex_profiler/profiler.py @@ -0,0 +1,258 @@ +""" +Core profiler implementation for Diffulex. +""" +from __future__ import annotations + +import time +from dataclasses import dataclass, field +from typing import Any, Optional, Dict, List +from contextlib import contextmanager +from pathlib import Path + +import torch + +from diffulex_profiler.metrics import PerformanceMetrics, collect_gpu_metrics, collect_memory_metrics +from diffulex_profiler.backends import ProfilerBackend, SimpleTimerBackend +from diffulex_profiler.exporters import ProfilerExporter, JSONExporter, SummaryExporter +from diffulex.logger import get_logger + +logger = get_logger(__name__) + + +@dataclass +class ProfilerConfig: + """Configuration for the profiler.""" + enabled: bool = True + backend: str = "simple" # "simple", "viztracer", "pytorch" + output_dir: str = "log/profiles" + output_file: Optional[str] = None + collect_gpu_metrics: bool = True + collect_memory_metrics: bool = True + collect_timing: bool = True + export_formats: List[str] = field(default_factory=lambda: ["json", "summary"]) + viztracer_config: Optional[Dict[str, Any]] = None + pytorch_profiler_config: Optional[Dict[str, Any]] = None + + +class DiffulexProfiler: + """ + Main profiler class for collecting performance metrics during Diffulex inference. + + Example: + >>> profiler = DiffulexProfiler(config=ProfilerConfig(enabled=True)) + >>> with profiler.profile("inference"): + ... outputs = llm.generate(prompts, sampling_params) + >>> profiler.export("log/profiles/result.json") + """ + + def __init__(self, config: Optional[ProfilerConfig] = None): + self.config = config or ProfilerConfig() + self.metrics: List[PerformanceMetrics] = [] + self.current_metrics: Optional[PerformanceMetrics] = None + self.backend: Optional[ProfilerBackend] = None + self.exporters: List[ProfilerExporter] = [] + + if not self.config.enabled: + return + + self._init_backend() + self._init_exporters() + Path(self.config.output_dir).mkdir(parents=True, exist_ok=True) + + def _init_backend(self): + """Initialize the profiling backend.""" + if self.config.backend == "simple": + self.backend = SimpleTimerBackend() + elif self.config.backend == "viztracer": + try: + from diffulex_profiler.backends import VizTracerBackend + viztracer_config = self.config.viztracer_config or {} + # Pass output_dir to VizTracerBackend so it can save files in the correct location + if "output_dir" not in viztracer_config: + viztracer_config["output_dir"] = self.config.output_dir + self.backend = VizTracerBackend(**viztracer_config) + except ImportError: + logger.warning("VizTracer not available, falling back to simple timer") + self.backend = SimpleTimerBackend() + elif self.config.backend == "pytorch": + try: + from diffulex_profiler.backends import PyTorchProfilerBackend + pytorch_config = self.config.pytorch_profiler_config or {} + self.backend = PyTorchProfilerBackend(**pytorch_config) + except ImportError: + logger.warning("PyTorch Profiler not available, falling back to simple timer") + self.backend = SimpleTimerBackend() + else: + logger.warning(f"Unknown backend '{self.config.backend}', using simple timer") + self.backend = SimpleTimerBackend() + + def _init_exporters(self): + """Initialize exporters based on config.""" + for fmt in self.config.export_formats: + if fmt == "json": + self.exporters.append(JSONExporter()) + elif fmt == "csv": + from 
diffulex_profiler.exporters import CSVExporter + self.exporters.append(CSVExporter()) + elif fmt == "summary": + self.exporters.append(SummaryExporter()) + else: + logger.warning(f"Unknown export format '{fmt}', skipping") + + @contextmanager + def profile(self, name: str, metadata: Optional[Dict[str, Any]] = None): + """ + Context manager for profiling a code block. + + Args: + name: Name of the profiling section + metadata: Optional metadata to attach to the metrics + + Example: + >>> with profiler.profile("model_forward", {"batch_size": 32}): + ... output = model(input_ids) + """ + if not self.config.enabled: + yield + return + + self.start(name, metadata) + try: + yield + finally: + self.stop() + + def start(self, name: str, metadata: Optional[Dict[str, Any]] = None): + """Start profiling a section.""" + if not self.config.enabled: + return + + self.current_metrics = PerformanceMetrics( + name=name, + metadata=metadata or {}, + ) + + if self.config.collect_timing: + self.current_metrics.start_time = time.perf_counter() + + if self.backend: + self.backend.start(name) + + if self.config.collect_gpu_metrics and torch.cuda.is_available(): + self.current_metrics.gpu_metrics_start = collect_gpu_metrics() + + if self.config.collect_memory_metrics: + self.current_metrics.memory_metrics_start = collect_memory_metrics() + + def stop(self): + """Stop profiling the current section.""" + if not self.config.enabled or self.current_metrics is None: + return + + if self.config.collect_timing: + self.current_metrics.end_time = time.perf_counter() + self.current_metrics.duration = ( + self.current_metrics.end_time - self.current_metrics.start_time + ) + + if self.backend: + backend_data = self.backend.stop() + if backend_data: + self.current_metrics.backend_data = backend_data + + if self.config.collect_gpu_metrics and torch.cuda.is_available(): + self.current_metrics.gpu_metrics_end = collect_gpu_metrics() + if self.current_metrics.gpu_metrics_start and self.current_metrics.gpu_metrics_end: + self.current_metrics.gpu_utilization = ( + self.current_metrics.gpu_metrics_end.get("utilization", 0) - + self.current_metrics.gpu_metrics_start.get("utilization", 0) + ) + + if self.config.collect_memory_metrics: + self.current_metrics.memory_metrics_end = collect_memory_metrics() + if (self.current_metrics.memory_metrics_start and + self.current_metrics.memory_metrics_end): + start_mem = self.current_metrics.memory_metrics_start.get("used_mb", 0) + end_mem = self.current_metrics.memory_metrics_end.get("used_mb", 0) + self.current_metrics.memory_delta_mb = end_mem - start_mem + + self.metrics.append(self.current_metrics) + self.current_metrics = None + + def record_metric(self, name: str, value: Any): + """Record a custom metric.""" + if not self.config.enabled or self.current_metrics is None: + return + self.current_metrics.custom_metrics[name] = value + + def record_throughput(self, tokens: int, duration: Optional[float] = None): + """Record throughput in tokens per second.""" + if not self.config.enabled or self.current_metrics is None: + return + if duration is None: + duration = self.current_metrics.duration + if duration and duration > 0: + self.current_metrics.throughput_tokens_per_sec = tokens / duration + self.current_metrics.total_tokens = tokens + + def export(self, output_path: Optional[str] = None): + """ + Export profiling results using configured exporters. + + Args: + output_path: Optional custom output path. If not provided, uses config output_file + or generates one based on timestamp. 
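+
+        Example (illustrative; the files actually written depend on
+        `export_formats` and each exporter's suffix):
+
+            >>> profiler.export()                     # log/profiles/profile_<timestamp>.json / .txt
+            >>> profiler.export("log/profiles/run1")  # run1.json / run1.txt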
+ """ + if not self.config.enabled or not self.metrics: + logger.warning("No metrics to export") + return + + if output_path is None: + if self.config.output_file: + output_path = str(Path(self.config.output_dir) / self.config.output_file) + else: + timestamp = time.strftime("%Y%m%d_%H%M%S") + output_path = str(Path(self.config.output_dir) / f"profile_{timestamp}") + + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + for exporter in self.exporters: + try: + exporter.export(self.metrics, output_path) + except Exception as e: + logger.error(f"Failed to export using {exporter.__class__.__name__}: {e}") + + def get_summary(self) -> Dict[str, Any]: + """Get a summary of all collected metrics.""" + if not self.metrics: + return {} + + total_duration = sum(m.duration for m in self.metrics if m.duration) + total_tokens = sum(m.total_tokens for m in self.metrics if m.total_tokens) + avg_throughput = ( + total_tokens / total_duration + if total_duration > 0 and total_tokens > 0 + else 0 + ) + + return { + "total_sections": len(self.metrics), + "total_duration_sec": total_duration, + "total_tokens": total_tokens, + "avg_throughput_tokens_per_sec": avg_throughput, + "sections": [ + { + "name": m.name, + "duration_sec": m.duration, + "throughput_tokens_per_sec": m.throughput_tokens_per_sec, + "total_tokens": m.total_tokens, + } + for m in self.metrics + ], + } + + def clear(self): + """Clear all collected metrics.""" + self.metrics.clear() + self.current_metrics = None \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat index 2034948..51d3652 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,35 +1,35 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/examples/test_dream_diffulex_gsm8k.py b/examples/test_dream_diffulex_gsm8k.py index 6605627..de3a2aa 100755 --- a/examples/test_dream_diffulex_gsm8k.py +++ b/examples/test_dream_diffulex_gsm8k.py @@ -10,30 +10,6 @@ from transformers import AutoTokenizer from diffulex import Diffulex, SamplingParams - - -def summarize_profiling(csv_path: str) -> dict: - totals = {} - total_nums = {} - avgs = {} - with open(csv_path, 'r', newline='') as f: - reader = csv.dictReader(f) - for row in reader: - for k, v in row.items(): - try: - val = float(v) - except ValueError: - continue - if val != 0.0: - total_nums[k] = total_nums.get(k, 0) + 1 - totals[k] = totals.get(k, 0.0) + val - print(pd.DataFrame([totals]).T) - for k, v in totals.items(): - if k in total_nums and total_nums[k] > 0: - avgs[k] = v / total_nums[k] - else: - avgs[k] = 0.0 - print(pd.DataFrame([avgs]).T) FEW_SHOTS=""" @@ -49,7 +25,7 @@ def summarize_profiling(csv_path: str) -> dict: use_lora=True, model_name="dream", enforce_eager=True, - data_parallel_size=1, + data_parallel_size=8, tensor_parallel_size=1, gpu_memory_utilization=0.25, max_num_batched_tokens=2048, diff --git a/examples/test_sdar_diffulex_gsm8k.py b/examples/test_sdar_diffulex_gsm8k.py index b0fc8d5..b4f360c 100755 --- a/examples/test_sdar_diffulex_gsm8k.py +++ b/examples/test_sdar_diffulex_gsm8k.py @@ -12,34 +12,11 @@ from diffulex import Diffulex, SamplingParams -def summarize_profiling(csv_path: str) -> dict: - totals = {} - total_nums = {} - avgs = {} - with open(csv_path, 'r', newline='') as f: - reader = csv.DictReader(f) - for row in reader: - for k, v in row.items(): - try: - val = float(v) - except ValueError: - continue - if val != 0.0: - total_nums[k] = total_nums.get(k, 0) + 1 - totals[k] = totals.get(k, 0.0) + val - print(pd.DataFrame([totals]).T) - for k, v in totals.items(): - if k in total_nums and total_nums[k] > 0: - avgs[k] = v / total_nums[k] - else: - avgs[k] = 0.0 - print(pd.DataFrame([avgs]).T) - FEW_SHOTS = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nQuestion: Jen and Tyler are gymnasts practicing flips. Jen is practicing the triple-flip while Tyler is practicing the double-flip. Jen did sixteen triple-flips during practice. Tyler flipped in the air half the number of times Jen did. How many double-flips did Tyler do?\nAnswer:<|im_end|>\n<|im_start|>assistant\nJen did 16 triple-flips, so she did 16 * 3 = <<16*3=48>>48 flips.\nTyler did half the number of flips, so he did 48 / 2 = <<48/2=24>>24 flips.\nA double flip has two flips, so Tyler did 24 / 2 = <<24/2=12>>12 double-flips.\n#### 12<|im_end|>\n<|im_start|>user\nQuestion: Four people in a law firm are planning a party. Mary will buy a platter of pasta for $20 and a loaf of bread for $2. Elle and Andrea will split the cost for buying 4 cans of soda which cost $1.50 each, and chicken wings for $10. Joe will buy a cake that costs $5. 
How much more will Mary spend than the rest of the firm put together?\nAnswer:<|im_end|>\n<|im_start|>assistant\nMary will spend $20 + $2 = $<<20+2=22>>22.\nElle and Andrea will spend $1.5 x 4 = $<<1.5*4=6>>6 for the soda.\nElle and Andrea will spend $6 + $10 = $<<6+10=16>>16 for the soda and chicken wings.\nElle, Andrea, and Joe together will spend $16 + $5 = $<<16+5=21>>21.\nSo, Mary will spend $22 - $21 = $<<22-21=1>>1 more than all of them combined.\n#### 1<|im_end|>\n<|im_start|>user\nQuestion: A charcoal grill burns fifteen coals to ash every twenty minutes of grilling. The grill ran for long enough to burn three bags of coals. Each bag of coal contains 60 coals. How long did the grill run?\nAnswer:<|im_end|>\n<|im_start|>assistant\nThe grill burned 3 * 60 = <<3*60=180>>180 coals.\nIt takes 20 minutes to burn 15 coals, so the grill ran for 180 / 15 * 20 = <<180/15*20=240>>240 minutes.\n#### 240<|im_end|>\n<|im_start|>user\nQuestion: A bear is preparing to hibernate for the winter and needs to gain 1000 pounds. At the end of summer, the bear feasts on berries and small woodland animals. During autumn, it devours acorns and salmon. It gained a fifth of the weight it needed from berries during summer, and during autumn, it gained twice that amount from acorns. Salmon made up half of the remaining weight it had needed to gain. How many pounds did it gain eating small animals?\nAnswer:<|im_end|>\n<|im_start|>assistant\nThe bear gained 1 / 5 * 1000 = <<1/5*1000=200>>200 pounds from berries.\nIt gained 2 * 200 = <<2*200=400>>400 pounds from acorns.\nIt still needed 1000 - 200 - 400 = <<1000-200-400=400>>400 pounds.\nThus, it gained 400 / 2 = <<400/2=200>>200 pounds from salmon.\nTherefore, the bear gained 400 - 200 = <<400-200=200>>200 pounds from small animals.\n#### 200<|im_end|>\n<|im_start|>user\nQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nAnswer:<|im_end|>\n<|im_start|>assistant\n" # FEW_SHOTS = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" if __name__ == "__main__": - PROFILE = False + PROFILE = True # model = "/root/data/ckpts/JetLM/SDAR-1.7B-Chat-b32" model = "/data1/ckpts/JetLM/SDAR-1.7B-Chat-b32" dataset = load_dataset("gsm8k", "main", split="test")["question"][:1] diff --git a/format.sh b/format.sh new file mode 100755 index 0000000..3cc4390 --- /dev/null +++ b/format.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +# Usage: +# # Do work and commit your work. +# +# # Format files that differ from origin/main. +# bash format.sh +# +# # Format all files. +# bash format.sh --all +# +# +# Ruff (format) + Clang formatter (if installed). This script formats all changed files from the last mergebase. +# You are encouraged to run this locally before pushing changes for review. + +# Cause the script to exit if a single command fails +set -eo pipefail + +if [[ -z "${BASH_VERSION}" ]]; then + echo "Please run this script using bash." 
>&2 + exit 1 +fi + +# this stops git rev-parse from failing if we run this from the .git directory +builtin cd "$(dirname "${BASH_SOURCE:-$0}")" +ROOT="$(git rev-parse --show-toplevel)" +builtin cd "$ROOT" || exit 1 + +ALL_FILES='' +ONLY_CHANGED='' +FILES=() +if (($# == 0)); then + # Default: allow dirty workspace; run on changed files (committed + worktree) + ONLY_CHANGED='true' +else + while (($# > 0)); do + case "$1" in + --files) + shift + while (($# > 0)); do + FILES+=("$1") + shift + done + ;; + --all) + ALL_FILES='true' + shift + ;; + *) + echo "Unknown argument: '$1'" >&2 + exit 1 + ;; + esac + done +fi + +MERGE_BASE="" +get_merge_base() { + UPSTREAM_REPO="https://github.com/tile-ai/tilelang" + if git ls-remote --exit-code "${UPSTREAM_REPO}" main &>/dev/null; then + # First try to use the upstream repository directly + MERGE_BASE="$(git fetch "${UPSTREAM_REPO}" main &>/dev/null && git merge-base FETCH_HEAD HEAD)" + elif git show-ref --verify --quiet refs/remotes/origin/main; then + # Fall back to origin/main if available + BASE_BRANCH="origin/main" + MERGE_BASE="$(git merge-base "${BASE_BRANCH}" HEAD)" + else + # Last resort, use local main + BASE_BRANCH="main" + MERGE_BASE="$(git merge-base "${BASE_BRANCH}" HEAD)" + fi + echo "${MERGE_BASE}" +} + +if [[ -n "${ALL_FILES}" ]]; then + echo "Checking all files..." >&2 +elif [[ -n "${ONLY_CHANGED}" ]]; then + MERGE_BASE="$(get_merge_base)" + echo "Checking changed files vs merge base (${MERGE_BASE}) and working tree..." >&2 +elif [[ "${#FILES[@]}" -gt 0 ]]; then + echo "Checking specified files: ${FILES[*]}..." >&2 +fi + +# Some systems set pip's default to --user, which breaks isolated virtualenvs. +export PIP_USER=0 + +# If pre-commit is not installed, install it. +if ! python3 -m pre_commit --version &>/dev/null; then + python3 -m pip install pre-commit --user +fi + +echo 'tile-lang pre-commit: Check Start' + +if [[ -n "${ALL_FILES}" ]]; then + python3 -m pre_commit run --all-files +elif [[ -n "${ONLY_CHANGED}" ]]; then + # Collect changed files (committed since merge-base + current worktree) + CHANGED_FILES="$(git diff --name-only --diff-filter=ACM "${MERGE_BASE}" 2>/dev/null || true)" + if [[ -n "${CHANGED_FILES}" ]]; then + echo "Running pre-commit on changed files:" + echo "${CHANGED_FILES}" + # Convert newline-separated files to space-separated and run pre-commit once + CHANGED_FILES_SPACE="$(echo "${CHANGED_FILES}" | tr '\n' ' ')" + python3 -m pre_commit run --files ${CHANGED_FILES_SPACE} + else + echo "No files changed relative to merge base and worktree. Skipping pre-commit." + fi +elif [[ "${#FILES[@]}" -gt 0 ]]; then + python3 -m pre_commit run --files "${FILES[@]}" +fi + +echo 'tile-lang pre-commit: Done' + +echo 'tile-lang clang-tidy: Check Start' +# If clang-tidy is available, run it; otherwise, skip +if [[ -x "$(command -v run-clang-tidy)" ]]; then + # Check if clang-tidy is available + if [[ ! -x "$(command -v clang-tidy)" ]]; then + python3 -m pip install --upgrade --requirements "${ROOT}/requirements-lint.txt" --user + fi + # Get clang-tidy version + CLANG_TIDY_VERSION="$(clang-tidy --version | head -n1 | awk '{print $4}')" + echo "Using clang-tidy version: ${CLANG_TIDY_VERSION}" + + # Check if build directory exists + if [[ ! -d "${ROOT}/build" ]]; then + echo "Build directory not found. Skipping clang-tidy checks." 
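+        # run-clang-tidy reads the compilation database (compile_commands.json)
+        # expected under build/ (the `-p build` flag used below), so the check
+        # cannot run without that directory.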
+ else + # Run clang-tidy on specified files + clang_tidy_files() { + run-clang-tidy -j 64 "$@" -p build + } + + # Run clang-tidy on all C/C++ source files + clang_tidy_all() { + run-clang-tidy -j 64 src/*.cc -p build + } + + # Run clang-tidy on changed C/C++ files relative to main + clang_tidy_changed() { + # Get changed C/C++ files + CHANGED_FILES="$(git diff --name-only --diff-filter=ACM "${MERGE_BASE}" -- '*.c' '*.cc' '*.cpp' '*.h' '*.hpp' 2>/dev/null || true)" + + if [[ -n "${CHANGED_FILES}" ]]; then + echo "Running clang-tidy on changed files:" + echo "${CHANGED_FILES}" + # Convert newline-separated files to space-separated and run clang-tidy once + CHANGED_FILES_SPACE="$(echo "${CHANGED_FILES}" | tr '\n' ' ')" + run-clang-tidy -j 64 ${CHANGED_FILES_SPACE} -p build -fix + else + echo "No C/C++ files changed. Skipping clang-tidy." + fi + } + + if [[ -n "${ALL_FILES}" ]]; then + # If --all is given, run clang-tidy on all source files + clang_tidy_all + elif [[ -n "${ONLY_CHANGED}" ]]; then + # Otherwise, run clang-tidy only on changed C/C++ files + clang_tidy_changed + elif [[ "${#FILES[@]}" -gt 0 ]]; then + # If --files is given, run clang-tidy only on the provided files + clang_tidy_files "${FILES[@]}" + fi + fi + +else + echo "run-clang-tidy not found. Skipping clang-tidy checks." + echo "To install clang-tidy tools, you may need to install clang-tidy and run-clang-tidy." +fi +echo 'tile-lang clang-tidy: Done' + +# Check if there are any uncommitted changes after all formatting steps. +# If there are, ask the user to review and stage them. +if ! git diff --quiet &>/dev/null; then + echo 'Reformatted files. Please review and stage the changes.' + echo 'Changes not staged for commit:' + echo + git --no-pager diff --name-only + + exit 1 +fi + +echo 'tile-lang: All checks passed' diff --git a/profile/d2f_dream_profile.py b/profile/d2f_dream_profile.py new file mode 100644 index 0000000..750fe4f --- /dev/null +++ b/profile/d2f_dream_profile.py @@ -0,0 +1,87 @@ +""" +D2F Dream Model Profiling Example + +This example demonstrates how to profile the performance +of Dream model with D2F decoding strategy using nsys. 
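+
+Typical launch under Nsight Systems (illustrative; adjust the flags and output
+path to your environment):
+
+    nsys profile -o log/profiles/d2f_dream_trace python profile/d2f_dream_profile.py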
+""" +import os +import time +from pathlib import Path +from diffulex import Diffulex, SamplingParams +from transformers import AutoTokenizer + + +def main(): + model_path = "/data1/ckpts/Dream-org/Dream-v0-Base-7B" + lora_path = "/data1/ckpts/SJTU-Deng-Lab/D2F_Dream_Base_7B_Lora" + + output_dir = Path("log/profiles") + output_dir.mkdir(parents=True, exist_ok=True) + + print("Loading model...") + model_load_start = time.time() + llm = Diffulex( + model_path, + lora_path=lora_path, + use_lora=True, + model_name="dream", + enforce_eager=True, + tensor_parallel_size=1, + data_parallel_size=1, + gpu_memory_utilization=0.25, + max_model_len=2048, + decoding_strategy="d2f", + mask_token_id=151666, + diffusion_block_size=32, + accept_threshold=0.95, + complete_threshold=0.9, + add_new_block_threshold=0.1, + ) + model_load_time = time.time() - model_load_start + print(f"Model loaded in {model_load_time:.2f} seconds") + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + sampling_params = SamplingParams(temperature=0.0, max_tokens=256) + + prompts = [ + "What is 2+2?", + "Explain quantum computing in simple terms.", + "Write a Python function to calculate factorial.", + ] + + print(f"\nStarting inference profiling...") + + inference_start = time.time() + outputs = llm.generate(prompts, sampling_params) + inference_time = time.time() - inference_start + + total_tokens = sum(len(o.get('token_ids', [])) for o in outputs) + num_outputs = len(outputs) + avg_diff_steps = sum(o.get('n_diff_steps', 0) for o in outputs) / num_outputs if outputs else 0 + throughput = total_tokens / inference_time if inference_time > 0 else 0 + + print("\n" + "=" * 80) + print("Profiling Summary") + print("=" * 80) + print(f"Model Loading Time: {model_load_time:.2f} seconds") + print(f"Inference Time: {inference_time:.2f} seconds") + print(f"Total Duration: {model_load_time + inference_time:.2f} seconds") + print(f"\nInference Metrics:") + print(f" Number of Prompts: {num_outputs}") + print(f" Total Tokens: {total_tokens}") + print(f" Average Throughput: {throughput:.2f} tokens/sec") + print(f" Average Diffusion Steps: {avg_diff_steps:.2f}") + print("=" * 80) + + print("\nGenerated Output Preview:") + for idx, output in enumerate(outputs): + print(f"\n[Prompt {idx + 1}]") + print(f"Input: {prompts[idx]}") + print(f"Output: {output.get('text', 'N/A')[:200]}...") + print(f"Token Count: {len(output.get('token_ids', []))}") + if 'n_diff_steps' in output: + print(f"Diffusion Steps: {output['n_diff_steps']}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ebc9aa3..30a6222 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,11 @@ dependencies = [ "matplotlib>=3.10.5", "fastapi>=0.115.0", "uvicorn>=0.30.0", + "pandas>=2.3.3", + "tilelang>=0.1.7.post1", + "rich>=13.0.0", + "colorama>=0.4.6", + "lm-eval" ] [project.urls] @@ -39,14 +44,16 @@ Homepage = "https://github.com/zhijie-group/D2fEngine" Repository = "https://zhijie-group.github.io/D2fEngine" "Organization" = "https://github.com/zhijie-group" +[[tool.uv.index]] +url = "https://mirrors.aliyun.com/pypi/simple" +default = true + [tool.setuptools.packages.find] include = [ "diffulex", + "diffulex_bench", "diffulex_kernel", "diffulex_legacy", + "diffulex_profiler", "test" ] - -[[tool.uv.index]] -url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -default = true \ No newline at end of file diff --git a/script/d2f_dream_eval_gsm8k.sh 
b/script/d2f_dream_eval_gsm8k.sh new file mode 100755 index 0000000..7cece76 --- /dev/null +++ b/script/d2f_dream_eval_gsm8k.sh @@ -0,0 +1,16 @@ +#!/usr/bin/zsh + +export HF_HUB_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +export HF_EVALUATE_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +export WANDB_DISABLED=true + +export HF_HOME="$(pwd)/cache" +export HF_DATASETS_CACHE="$HF_HOME/datasets" +export HF_METRICS_CACHE="$HF_HOME/metrics" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +python -m diffulex_bench.main \ + --config custom_configs/d2f_dream_eval_gsm8k.yml \ + 2>&1 | tee log/d2f_dream_eval_gsm8k.log \ No newline at end of file