diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index c2c587da9fe..b4a1880ca69 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -19,8 +19,10 @@ from executorch.exir._warnings import experimental from executorch.exir.backend.backend_details import ExportedProgram, PreprocessResult from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.program._program import _update_exported_program_graph_module from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu from torch.export.passes import move_to_device_pass +from torch.fx.passes.infra.pass_base import PassResult class COMPILE_SPEC_KEYS(Enum): @@ -156,7 +158,40 @@ def preprocess( # Apply custom backend-specific passes custom_passes = cls.get_custom_passes(compile_specs) for custom_pass in custom_passes: - custom_pass(device_edge_program.graph_module) + result = custom_pass(device_edge_program.graph_module) + # Handle passes that return PassResult with a new graph_module + if isinstance(result, PassResult) and result.modified: + # Use a permissive verifier that allows all operator types including + # edge ops and custom triton ops. The default verifier only allows + # torch._ops.OpOverload and HigherOrderOperator, but edge dialect + # uses EdgeOpOverload, triton ops may use OpOverloadPacket or CustomOpDef. + from executorch.exir.dialects.edge._ops import EdgeOpOverload + from torch._export.verifier import Verifier + from torch._library.custom_ops import CustomOpDef + from torch._ops import ( + HigherOrderOperator, + OpOverload, + OpOverloadPacket, + ) + + class _PermissiveVerifier(Verifier): + dialect = "EDGE" + + def allowed_op_types(self): + return ( + OpOverload, + OpOverloadPacket, + HigherOrderOperator, + EdgeOpOverload, + CustomOpDef, + ) + + def check_valid_op(self, op): + pass # Allow all ops + + device_edge_program = _update_exported_program_graph_module( + device_edge_program, result.graph_module, override_verifiers=[_PermissiveVerifier] + ) # Run decompositions if any if decomposition_table: diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py index f0d3a000ec0..f5501e1fad5 100644 --- a/backends/cuda/cuda_backend.py +++ b/backends/cuda/cuda_backend.py @@ -10,10 +10,12 @@ import torch from executorch.backends.aoti.aoti_backend import AotiBackend +from executorch.backends.cuda.passes import FuseInt4WeightOnlyQuantMatmulPass from executorch.backends.cuda.triton.replacement_pass import ( ReplaceEdgeOpWithTritonOpPass, ) from executorch.exir._warnings import experimental +from torch.fx.passes.dialect.common.cse_pass import CSEPass from executorch.exir.backend.backend_details import BackendDetails from executorch.exir.backend.compile_spec_schema import CompileSpec from torch._inductor.decomposition import conv1d_to_conv2d @@ -50,12 +52,26 @@ def get_decomposition_table(cls) -> Dict[Any, Any]: @classmethod def get_custom_passes(cls, compile_specs: List[CompileSpec]) -> List[typing.Any]: """ - Return CUDA-specific passes: ReplaceEdgeOpWithTritonOpPass. + Return CUDA-specific passes. 
+ + Passes include: + - CSEPass: Common subexpression elimination to merge duplicate preprocessing chains + - FuseInt4WeightOnlyQuantMatmulPass: Fuses INT4 matmul ops sharing the same input + - ReplaceEdgeOpWithTritonOpPass: Replaces edge ops with Triton kernels (optional) The Triton kernel replacement behavior can be controlled via compile_specs: - triton_kernel_mode="ON": Always use Triton kernels - triton_kernel_mode="OFF": Never use Triton kernels and fallback to other implementations like cuda or decomposed operator. """ + # Start with CSE pass to merge duplicate preprocessing chains + # This enables Int4 fusion to group operations with identical preprocessing + passes: List[typing.Any] = [CSEPass()] + + # Fuse INT4 weight-only quantized matmul operations + # Reduces kernel launch overhead by fusing Q/K/V and Gate/Up projections + # REQUIRES: CSE pass to have run first (to merge preprocessing) + passes.append(FuseInt4WeightOnlyQuantMatmulPass()) + # Parse compile_specs for triton_kernel_mode triton_kernel_mode = "ON" # Default mode for spec in compile_specs: @@ -68,7 +84,10 @@ def get_custom_passes(cls, compile_specs: List[CompileSpec]) -> List[typing.Any] ) triton_kernel_mode = mode - return [ReplaceEdgeOpWithTritonOpPass()] if triton_kernel_mode == "ON" else [] + if triton_kernel_mode == "ON": + passes.append(ReplaceEdgeOpWithTritonOpPass()) + + return passes @classmethod def get_aoti_compile_options( diff --git a/backends/cuda/passes/__init__.py b/backends/cuda/passes/__init__.py new file mode 100644 index 00000000000..b54f5398c99 --- /dev/null +++ b/backends/cuda/passes/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""CUDA backend optimization passes.""" + +from .fuse_int4_quant_matmul import FuseInt4WeightOnlyQuantMatmulPass + +__all__ = ["FuseInt4WeightOnlyQuantMatmulPass"] diff --git a/backends/cuda/passes/fuse_int4_quant_matmul.py b/backends/cuda/passes/fuse_int4_quant_matmul.py new file mode 100644 index 00000000000..598a4acb941 --- /dev/null +++ b/backends/cuda/passes/fuse_int4_quant_matmul.py @@ -0,0 +1,433 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +INT4 Weight-Only Quantized Matmul Fusion Pass + +This pass fuses multiple int4pack_mm operations that share the same input tensor +into a single fused operation, reducing kernel launch overhead. + +ALGORITHM: + +The fusion transforms: + input → int4mm(input, W_q, block_size, S_q) → Q + input → int4mm(input, W_k, block_size, S_k) → K + input → int4mm(input, W_v, block_size, S_v) → V + +Into: + fused_W = cat([W_q, W_k, W_v], dim=0) + fused_S = cat([S_q, S_k, S_v], dim=1) + fused_output = int4mm(input, fused_W, block_size, fused_S) + [Q, K, V] = split(fused_output, dim=-1) + +CORRECTNESS: + +This transformation is mathematically valid due to matrix multiplication's +distributive property over concatenation: + + X @ [W_1 | W_2 | W_3] = [X @ W_1 | X @ W_2 | X @ W_3] + +Where [A | B] denotes horizontal concatenation along the output dimension. 
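+
+A shape-level illustration (dimensions are hypothetical, mirroring a Q/K/V
+projection with hidden size 1280):
+
+    X: [batch, seq, 1280]; W_q, W_k, W_v each map 1280 -> 1280
+    cat([W_q, W_k, W_v], dim=0) maps 1280 -> 3840, so the fused output is
+    [batch, seq, 3840], and tensor_split at offsets [1280, 2560] along the
+    last dimension recovers Q, K, V.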
+ +For INT4 quantized operations: + int4mm(X, W, bs, S) computes: X @ dequantize(W, S, bs) + +Therefore: + int4mm(X, cat([W_1, W_2, W_3]), bs, cat([S_1, S_2, S_3])) + = cat([int4mm(X, W_1, bs, S_1), int4mm(X, W_2, bs, S_2), int4mm(X, W_3, bs, S_3)]) + +PREREQUISITES: + +This pass requires Common Subexpression Elimination (CSE) to run first: +- CSE merges duplicate preprocessing chains (reshape, cast, pad, etc.) +- After CSE, operations with identical preprocessing share the same input node +- This allows simple grouping by checking node.args[0] equality + +EXAMPLES: + +1. Attention QKV projection: 3 int4mm ops → 1 fused op +2. MLP Gate/Up projection: 2 int4mm ops → 1 fused op +3. Multi-head attention (8 heads): 8 int4mm ops → 3 fused ops (max_fusion_size=3) +""" + +import operator +from collections import defaultdict +from typing import Dict, List, Optional, Tuple + +import torch +from executorch.exir.pass_base import ExportPass +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_base import PassResult + + +class FuseInt4WeightOnlyQuantMatmulPass(ExportPass): + """ + Fuses INT4 weight-only quantized matmul operations sharing the same input. + + This pass identifies groups of aten._weight_int4pack_mm operations that: + 1. Share the same input tensor (after CSE preprocessing) + 2. Have the same block_size parameter + 3. Have compatible dtypes and devices + + For each group, it: + 1. Concatenates weights and scales + 2. Creates a single fused int4mm operation + 3. Splits the output back to individual results + + Args: + min_fusion_size: Minimum number of operations to fuse (default: 2) + max_fusion_size: Maximum operations per fused group (default: 3) + """ + + def __init__(self, min_fusion_size: int = 2, max_fusion_size: int = 3): + super().__init__() + self.min_fusion_size = min_fusion_size + self.max_fusion_size = max_fusion_size + + def call(self, graph_module: GraphModule) -> PassResult: + """Apply fusion pass to the graph.""" + groups = self._find_fuseable_groups(graph_module) + fusion_results = [self._fuse_group(graph_module, g) for g in groups] + modified = any(fusion_results) + + if modified: + graph_module.graph.lint() + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + try: + graph_module = super().call(graph_module).graph_module + except Exception: + # super().call() may fail on mock graphs without proper metadata + pass + + return PassResult(graph_module, modified) + + def _is_int4mm(self, node: Node) -> bool: + """Check if node is an int4pack_mm operation. + + Handles both standard torch ops and EdgeOpOverload (edge dialect). + """ + if node.op != "call_function": + return False + + target = node.target + + # Direct match for standard torch op + if target == torch.ops.aten._weight_int4pack_mm.default: + return True + + # Handle EdgeOpOverload (edge dialect wraps ops) + # Check if the target's name matches the int4pack_mm op + target_name = getattr(target, "_name", None) or getattr(target, "name", lambda: "")() + if "_weight_int4pack_mm" in str(target_name) or "_weight_int4pack_mm" in str(target): + return True + + return False + + def _get_params(self, node: Node) -> Optional[Tuple[Node, int, Node]]: + """ + Extract parameters from int4mm node. 
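+
+        The node is expected to follow the aten call convention
+        _weight_int4pack_mm(input, weight, block_size, scale_and_zeros),
+        so args[1], args[2] and args[3] are read as weight, block_size and
+        scale respectively.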
+ + Returns: + (weight_node, block_size, scale_node) or None if invalid + """ + if not self._is_int4mm(node) or len(node.args) < 4: + return None + + w, bs, s = node.args[1], node.args[2], node.args[3] + if isinstance(w, Node) and isinstance(s, Node) and isinstance(bs, int): + return (w, bs, s) + return None + + def _get_out_features(self, node: Node) -> Optional[int]: + """Extract output dimension from node metadata.""" + val = node.meta.get("val") + return ( + val.shape[-1] + if isinstance(val, torch.Tensor) and len(val.shape) >= 2 + else None + ) + + def _validate_group(self, group: List[Node]) -> bool: + """ + Validate that a group of operations can be safely fused. + + Checks: + - Group size is within [min_fusion_size, max_fusion_size] + - All operations have compatible dtypes + - All operations have compatible devices + - All weights have the same input dimension (k) + """ + if len(group) < self.min_fusion_size: + return False + + has_metadata = all("val" in node.meta for node in group) + + if has_metadata: + # Verify dtype compatibility + dtypes = {node.meta["val"].dtype for node in group} + if len(dtypes) > 1: + return False + + # Verify device compatibility + devices = {str(node.meta["val"].device) for node in group} + if len(devices) > 1: + return False + + # Verify output dimensions are extractable + if not all(self._get_out_features(node) for node in group): + return False + + # Verify all operations have valid parameters + params_list = [self._get_params(n) for n in group] + if not all(params_list): + return False + + weights = [p[0] for p in params_list] + + # Verify weight input dimensions match (required for concatenation) + if has_metadata and all("val" in w.meta for w in weights): + k_dims = [w.meta["val"].shape[-1] for w in weights] + if len(set(k_dims)) > 1: + return False + + return True + + def _find_fuseable_groups(self, graph_module: GraphModule) -> List[List[Node]]: + """ + Identify groups of int4mm operations that can be fused together. + + Grouping strategy: + 1. Iterate through all int4mm operations in the graph + 2. Group operations by (input_node, block_size) + - Same input_node: operations consume the same preprocessed input + - Same block_size: required for compatible quantization + 3. Split large groups into chunks of max_fusion_size + 4. 
Validate each group before including it + + Returns: + List of fuseable groups, where each group is a list of nodes + """ + groups: Dict[Tuple[Node, int], List[Node]] = defaultdict(list) + + for node in graph_module.graph.nodes: + if not self._is_int4mm(node): + continue + + # Extract the immediate input node + if not node.args or not isinstance(node.args[0], Node): + continue + input_node = node.args[0] + + # Extract block_size parameter + params = self._get_params(node) + if params: + groups[(input_node, params[1])].append(node) + + # Split groups by max_fusion_size and validate + result = [] + for ops in groups.values(): + for i in range(0, len(ops), self.max_fusion_size): + group = ops[i : i + self.max_fusion_size] + if len(group) >= self.min_fusion_size and self._validate_group(group): + result.append(group) + + return result + + def _get_last_placeholder(self, graph_module: GraphModule) -> Optional[Node]: + """Find the last placeholder node in the graph.""" + last = None + for n in graph_module.graph.nodes: + if n.op == "placeholder": + last = n + else: + break + return last + + def _compute_cat_metadata( + self, nodes: List[Node], dim: int + ) -> Optional[torch.Tensor]: + """ + Compute metadata for concatenating tensors along a dimension. + + Args: + nodes: Source nodes to concatenate + dim: Dimension to concatenate along (0 or 1) + + Returns: + Fake tensor with concatenated shape, or None if metadata unavailable + """ + if not all("val" in n.meta for n in nodes): + return None + + # Get reference properties from first node + ref_val = nodes[0].meta["val"] + shapes = [n.meta["val"].shape for n in nodes] + + # Compute concatenated shape + result_shape = list(shapes[0]) + result_shape[dim] = sum(s[dim] for s in shapes) + + # Use device='meta' to support dynamic shapes with SymInt. + # Concrete device objects (e.g., 'cuda:0') fail when shape dimensions + # are symbolic rather than concrete integers. + return torch.empty( + tuple(result_shape), dtype=ref_val.dtype, device="meta" + ) + + def _fuse_group(self, graph_module: GraphModule, group: List[Node]) -> bool: + """ + Fuse a group of int4mm operations into a single operation. + + Transformation: + 1. Concatenate all weights along output dimension (dim=0) + 2. Concatenate all scales along output dimension (dim=1) + 3. Create single fused int4mm with concatenated weights/scales + 4. Split fused output back to individual results + 5. Replace original operations with split results + + Args: + graph_module: The graph to modify + group: List of int4mm nodes to fuse + + Returns: + True if fusion succeeded, False otherwise + """ + try: + params = self._get_params(group[0]) + if not params: + return False + _, block_size, _ = params + + # Extract weights and scales from all operations + params_list = [self._get_params(n) for n in group] + weights = [p[0] for p in params_list] + scales = [p[2] for p in params_list] + + # Compute output features once at the start for efficiency. + # These values are used in multiple places: fused_mm metadata, + # split_points calculation, and validation. + output_features = [self._get_out_features(n) for n in group] + if not all(output_features): + return False + + # Create concatenated weights and scales. + # Insert before the FIRST original int4mm node to maintain topological order. + # The original int4mm nodes are already correctly placed after their inputs + # (shared_input, weights, scales), so inserting near them preserves validity. 
+ first_int4mm = min(group, key=lambda n: list(graph_module.graph.nodes).index(n)) + with graph_module.graph.inserting_before(first_int4mm): + # IMPORTANT: Use args, not kwargs, for cat operations. + # AOT Inductor expects positional arguments and may not correctly + # handle kwargs for aten::cat, leading to empty tensor outputs. + fused_weight = graph_module.graph.call_function( + torch.ops.aten.cat.default, + args=(weights, 0), + ) + # Compute metadata for concatenated weights + if (val := self._compute_cat_metadata(weights, dim=0)) is not None: + fused_weight.meta["val"] = val + + fused_scale = graph_module.graph.call_function( + torch.ops.aten.cat.default, + args=(scales, 1), + ) + # Compute metadata for concatenated scales + if (val := self._compute_cat_metadata(scales, dim=1)) is not None: + fused_scale.meta["val"] = val + + # Create fused matmul operation AFTER fused_weight and fused_scale + # to maintain topological order (fused_mm depends on fused_weight and fused_scale) + fused_mm = graph_module.graph.call_function( + torch.ops.aten._weight_int4pack_mm.default, + args=(group[0].args[0], fused_weight, block_size, fused_scale), + ) + + # Set output metadata with total concatenated output dimension. + # Use device='meta' to support dynamic shapes with SymInt. + if "val" in group[0].meta: + base_shape = group[0].meta["val"].shape[:-1] + total_out = sum(output_features) + fused_mm.meta["val"] = torch.empty( + base_shape + (total_out,), + dtype=group[0].meta["val"].dtype, + device='meta', + ) + + # Calculate split points to divide the fused output back to individual results. + # For N operations, we need N-1 split points at cumulative output boundaries. + split_points = [] + offset = 0 + for out_feat in output_features[:-1]: + offset += out_feat + split_points.append(offset) + + # Split fused output back to individual results + with graph_module.graph.inserting_after(fused_mm): + split_list = graph_module.graph.call_function( + torch.ops.aten.tensor_split.indices, + args=(fused_mm, split_points, -1), + ) + # Set metadata for split operation (list of original output tensors) + if "val" in fused_mm.meta: + split_list.meta["val"] = [n.meta["val"] for n in group if "val" in n.meta] + + # Replace each original operation with its corresponding split result. + # IMPORTANT: tensor_split creates non-contiguous views with incorrect strides. + # For example, shape [batch, seq, hidden] gets strides [seq*3*hidden, 3*hidden, 1] + # instead of the expected [seq*hidden, hidden, 1]. This causes issues during + # AOTI compilation where kernels may assume contiguous memory layout. + # We add .contiguous() after each getitem to ensure proper memory layout. + for i, node in enumerate(group): + with graph_module.graph.inserting_after(split_list): + getitem = graph_module.graph.call_function( + operator.getitem, + args=(split_list, i), + ) + # Set metadata for getitem (non-contiguous view from tensor_split) + if "val" in node.meta: + getitem.meta["val"] = node.meta["val"] + + # Add contiguous() AFTER getitem to ensure proper memory layout. + # This is critical for encoder patterns (seq_len > 1) where + # the non-contiguous strides from tensor_split would cause + # incorrect memory access in downstream operations. + with graph_module.graph.inserting_after(getitem): + contiguous = graph_module.graph.call_function( + torch.ops.aten.contiguous.default, + args=(getitem,), + ) + # Set metadata for contiguous output. + # The output has the same shape but with proper contiguous strides. 
+ if "val" in node.meta: + # Create a contiguous version of the metadata tensor + orig_val = node.meta["val"] + contiguous.meta["val"] = torch.empty( + orig_val.shape, + dtype=orig_val.dtype, + device="meta", + ) + + node.replace_all_uses_with(contiguous) + + # Remove original operations + for node in group: + graph_module.graph.erase_node(node) + + return True + + except Exception as e: + # Log fusion failures with full context for debugging. + # Fusion is an optimization, so we gracefully skip failed groups, + # but we must provide visibility into failures to help developers + # identify and fix issues (e.g., graph structure problems, metadata bugs). + import logging + logger = logging.getLogger(__name__) + logger.warning( + f"Failed to fuse INT4 group of {len(group)} operations: {type(e).__name__}: {e}", + exc_info=True + ) + return False diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 0cef859ddfb..059d6c0ea29 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #include // Include our shim layer headers @@ -46,9 +48,88 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; +// Structure to hold a reference to a GPU tensor for "keep on device" +// optimization. Owns the tensor handle - must be deleted when no longer needed. +struct GpuTensorRef { + AOTITensorHandle handle; // Tensor handle (owned, for later deletion) + void* data_ptr; // GPU memory pointer (for D2D copy) + size_t size_bytes; // Total size in bytes +}; + class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: + // ============================================================================ + // GPU Tensor Storage for D2D Copy Optimization + // ============================================================================ + // + // This backend supports storing GPU tensors between execute() calls to enable + // device-to-device (D2D) copies instead of slower host-to-device (H2D) + // copies. This is useful for encoder-decoder models where the encoder output + // is reused across many decoder iterations. + // + // SUPPORTED OPTIONS (via set_option): + // + // "store_output" (string): Store the output tensor under this name after + // the next execute() call. The tensor remains on GPU until cleared. + // Only supports single-output methods. + // Example: opts.set_option("store_output", "encoder_output"); + // + // "use_stored_input" (string): For inputs matching the stored tensor's + // size, + // use D2D copy from the stored tensor instead of H2D copy from CPU. + // This setting persists across execute() calls until reset. + // Example: opts.set_option("use_stored_input", "encoder_output"); + // + // "reset_stored_input" (bool): Clear the use_stored_input setting. + // Does NOT delete the stored tensor - only stops using it for D2D. + // Example: opts.set_option("reset_stored_input", true); + // + // "clear_stored_tensor" (string): Delete the named tensor from storage, + // freeing GPU memory. Use after decoder loop completes. + // Example: opts.set_option("clear_stored_tensor", "encoder_output"); + // + // TYPICAL USAGE PATTERN (encoder-decoder model): + // + // 1. Before encoder: set_option("store_output", "encoder_output") + // 2. Execute encoder (output is stored on GPU) + // 3. 
Before decoder loop: set_option("use_stored_input", "encoder_output") + // 4. Execute decoder N times (D2D copies for encoder output input) + // 5. After decoder loop: + // set_option("reset_stored_input", true) + // set_option("clear_stored_tensor", "encoder_output") + // + // ============================================================================ + + // Storage control options (set via set_option before execute) + mutable std::string + store_output_name_; // Name to store output under (empty = none) + mutable std::string + use_stored_input_name_; // Name of stored tensor to use (empty = none) + + // Per-instance map of named GPU tensor references. + // Mutable because execute() is const but needs to modify this. + // + // LIFETIME CONTRACT: + // - Stored tensors are valid until overwritten or destroy() is called. + // - Caller must ensure the producing execute() call (e.g., encoder) completes + // before any consuming execute() call (e.g., decoder) begins. + // - Caller must not call destroy() while execute() is in progress. + // - Overwriting a tensor (same name) deletes the old tensor immediately, + // so caller must ensure no concurrent execute() is using it. + mutable std::unordered_map gpu_tensors_; + + // Helper to clear stored GPU tensors and free their memory. + // Only call when no execute() is in progress. + void clear_gpu_tensors() const { + for (auto& pair : gpu_tensors_) { + if (pair.second.handle != nullptr) { + aoti_torch_delete_tensor_object(pair.second.handle); + } + } + gpu_tensors_.clear(); + } + Error load_function_pointers_into_handle( void* so_handle, AOTIDelegateHandle* handle) const { @@ -91,6 +172,70 @@ class ET_EXPERIMENTAL CudaBackend final return 1; } + Error set_option( + __ET_UNUSED executorch::runtime::BackendOptionContext& context, + const executorch::runtime::Span& + backend_options) override { + for (size_t i = 0; i < backend_options.size(); i++) { + const auto& option = backend_options[i]; + // Handle store_output: expects a string name (e.g., "encoder_output") + if (strcmp(option.key, "store_output") == 0) { + if (auto* arr = std::get_if< + std::array>( + &option.value)) { + store_output_name_ = std::string(arr->data()); + } else { + ET_LOG(Error, "store_output option expects a string value"); + return Error::InvalidArgument; + } + } + // Handle use_stored_input: expects a string name (e.g., "encoder_output") + else if (strcmp(option.key, "use_stored_input") == 0) { + if (auto* arr = std::get_if< + std::array>( + &option.value)) { + use_stored_input_name_ = std::string(arr->data()); + } else { + ET_LOG(Error, "use_stored_input option expects a string value"); + return Error::InvalidArgument; + } + } + // Handle reset_stored_input: expects a boolean value + // Note: This only resets the name setting. The stored GPU tensor + // remains in memory until overwritten or destroy() is called. + else if (strcmp(option.key, "reset_stored_input") == 0) { + if (auto* val = std::get_if(&option.value)) { + if (*val) { + use_stored_input_name_.clear(); + } + } else { + ET_LOG(Error, "reset_stored_input option expects a boolean value"); + return Error::InvalidArgument; + } + } + // Handle clear_stored_tensor: expects a string name + // Deletes the named GPU tensor from storage, freeing GPU memory. 
+ else if (strcmp(option.key, "clear_stored_tensor") == 0) { + if (auto* arr = std::get_if< + std::array>( + &option.value)) { + std::string name(arr->data()); + auto it = gpu_tensors_.find(name); + if (it != gpu_tensors_.end()) { + if (it->second.handle != nullptr) { + aoti_torch_delete_tensor_object(it->second.handle); + } + gpu_tensors_.erase(it); + } + } else { + ET_LOG(Error, "clear_stored_tensor option expects a string value"); + return Error::InvalidArgument; + } + } + } + return Error::Ok; + } + // Once per loaded binary blob Result init( BackendInitContext& context, @@ -222,15 +367,52 @@ class ET_EXPERIMENTAL CudaBackend final std::vector gpu_outputs( n_outputs); // GPU tensors for kernel output + // RAII helper to ensure GPU tensors are cleaned up on all exit paths. + // Prevents memory leaks when errors occur during execute(). + struct TensorCleanup { + std::vector& inputs; + std::vector& outputs; + const std::unordered_map& stored_tensors; + + ~TensorCleanup() { + // Clean up input tensors + for (auto* handle : inputs) { + if (handle != nullptr) { + aoti_torch_delete_tensor_object(handle); + } + } + // Clean up output tensors, except those that are stored + for (auto* handle : outputs) { + if (handle != nullptr) { + bool is_stored = false; + for (const auto& pair : stored_tensors) { + if (pair.second.handle == handle) { + is_stored = true; + break; + } + } + if (!is_stored) { + aoti_torch_delete_tensor_object(handle); + } + } + } + } + }; + TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_}; + + // Track which input index was matched for D2D copy (for duplicate + // detection) + ssize_t matched_input_idx = -1; + // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies - for (int i = 0; i < n_inputs; i++) { + // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU. + for (size_t i = 0; i < n_inputs; i++) { // Get tensor dimensions and properties from ExecuTorch CPU tensor auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); auto scalar_type = cpu_tensor->scalar_type(); - // Create GPU tensor with same shape + // Create GPU tensor with same shape (always needed for AOTI format) std::vector sizes_vec(sizes.begin(), sizes.end()); AOTITensorHandle gpu_input_handle; @@ -246,21 +428,75 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( create_err == Error::Ok, Internal, - "Failed to create GPU tensor for input %d", + "Failed to create GPU tensor for input %zu", i); gpu_inputs[i] = gpu_input_handle; - // Copy data from CPU to GPU + // Check if this input matches a stored GPU tensor (by size). + if (!use_stored_input_name_.empty()) { + auto it = gpu_tensors_.find(use_stored_input_name_); + if (it != gpu_tensors_.end()) { + const GpuTensorRef& ref = it->second; + size_t numel = gpu_inputs[i]->numel(); + size_t elem_size = gpu_inputs[i]->element_size(); + size_t copy_bytes = numel * elem_size; + + // Match by size: use stored tensor if sizes match + if (copy_bytes == ref.size_bytes) { + if (matched_input_idx >= 0) { + // Another input already matched - warn about ambiguity + ET_LOG( + Error, + "Multiple inputs match stored tensor '%s' size (%zu bytes): " + "input %zd was used, input %zu also matches. 
" + "Consider using unique tensor sizes or a different matching strategy.", + use_stored_input_name_.c_str(), + copy_bytes, + matched_input_idx, + i); + } else { + // First match - perform D2D copy + matched_input_idx = static_cast(i); + + ET_LOG( + Debug, + "Using stored tensor '%s' for input %zu (%zu bytes, D2D copy)", + use_stored_input_name_.c_str(), + i, + copy_bytes); + + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format + cudaError_t cuda_err = cudaMemcpy( + gpu_inputs[i]->data_ptr(), + ref.data_ptr, + copy_bytes, + cudaMemcpyDeviceToDevice); + + ET_CHECK_OR_RETURN_ERROR( + cuda_err == cudaSuccess, + Internal, + "Failed GPU-to-GPU copy for input %zu: %s", + i, + cudaGetErrorString(cuda_err)); + + // Skip the CPU-to-GPU copy below + continue; + } + } + } + } + + // Copy data from CPU to GPU (normal path) ET_CHECK_OR_RETURN_ERROR( aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, Internal, - "Failed to copy input %d from CPU to GPU", + "Failed to copy input %zu from CPU to GPU", i); } // Process output tensors: create GPU counterparts for ExecuTorch CPU // tensors - for (int i = 0; i < n_outputs; i++) { + for (size_t i = 0; i < n_outputs; i++) { // Get output tensor dimensions from ExecuTorch CPU tensor auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); @@ -282,7 +518,7 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( create_err == Error::Ok, Internal, - "Failed to create GPU tensor for output %d", + "Failed to create GPU tensor for output %zu", i); gpu_outputs[i] = gpu_output_handle; @@ -303,20 +539,65 @@ class ET_EXPERIMENTAL CudaBackend final "AOTInductorModelContainerRun failed with error code %d", error); + // Store reference to output GPU tensor if requested. + // The tensor will be kept alive for later D2D copy to decoder inputs. 
+ if (!store_output_name_.empty()) { + ET_CHECK_OR_RETURN_ERROR( + n_outputs == 1, + InvalidArgument, + "store_output only supports single-output methods, got %zu outputs", + n_outputs); + + auto* gpu_tensor = gpu_outputs[0]; + size_t numel = gpu_tensor->numel(); + size_t elem_size = gpu_tensor->element_size(); + size_t size_bytes = numel * elem_size; + + // Delete old tensor if overwriting (erase first to prevent double-free) + auto old_it = gpu_tensors_.find(store_output_name_); + if (old_it != gpu_tensors_.end()) { + AOTITensorHandle old_handle = old_it->second.handle; + gpu_tensors_.erase(old_it); // Remove from map before deleting + if (old_handle != nullptr) { + aoti_torch_delete_tensor_object(old_handle); + } + } + + // Store tensor reference (we now own this tensor) + GpuTensorRef ref; + ref.handle = gpu_tensor; + ref.data_ptr = gpu_tensor->data_ptr(); + ref.size_bytes = size_bytes; + gpu_tensors_[store_output_name_] = ref; + + // Reset store_output name after storing + store_output_name_.clear(); + } + // Copy GPU output results back to CPU output tensors - for (int i = 0; i < n_outputs; i++) { + for (size_t i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); // For DYNAMIC_BOUND tensors we try to resize ET_CHECK_OK_OR_RETURN_ERROR( resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", + "Error resizing tensor at output index %zu", i); ET_CHECK_OK_OR_RETURN_ERROR( aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", + "Failed to copy GPU output %zu back to CPU", i); } + // Memory management notes: + // - GPU tensor cleanup is handled by TensorCleanup RAII guard above. + // - use_stored_input setting persists across execute() calls to support + // decoder loops that reuse the stored encoder output. + // - Stored GPU tensors (in gpu_tensors_) remain in memory until: + // (a) overwritten by a new tensor with the same name, or + // (b) destroy() is called, which frees all stored tensors. + // - The "reset_stored_input" option only resets the input name setting, + // NOT the stored GPU tensors themselves. + return Error::Ok; } @@ -326,6 +607,9 @@ class ET_EXPERIMENTAL CudaBackend final } AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + // Delete stored GPU tensors + clear_gpu_tensors(); + // Destroy the CUDA stream if it exists if (handle->cuda_stream != nullptr) { cudaStream_t cuda_stream = static_cast(handle->cuda_stream); diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py index ff4a9313545..d00e62c0268 100644 --- a/backends/cuda/tests/test_cuda_export.py +++ b/backends/cuda/tests/test_cuda_export.py @@ -325,3 +325,314 @@ def test_triton_kernel_mode_off(self): edge_program_manager, "SDPA kernel export with triton_kernel_mode=OFF failed", ) + + def test_whisper_decoder_int4_full_pass_chain(self): + """ + Test CUDA export for Whisper-like decoder with INT4 quantization. + + This test exercises the full CUDA backend pass chain: + 1. CSEPass - Common subexpression elimination to merge preprocessing chains + 2. FuseInt4WeightOnlyQuantMatmulPass - Fuses Q/K/V INT4 matmul operations + 3. 
ReplaceEdgeOpWithTritonOpPass - Replaces SDPA with Triton kernels + + The test creates a Whisper-like decoder layer with: + - Self-attention with Q/K/V projections (INT4 quantized, fuseable) + - Cross-attention with Q/K/V projections (INT4 quantized, fuseable) + - MLP with fc1/fc2 projections (INT4 quantized) + - SDPA for attention computation + + This is a regression test to ensure the full pass chain works correctly, + particularly the _PermissiveVerifier fix that allows EdgeOpOverload, + OpOverloadPacket (triton.sdpa), and CustomOpDef types. + """ + # Check for SM80+ (A100 or newer) required for INT4 tile_packed_to_4d format + if not torch.cuda.is_available(): + self.skipTest("CUDA not available") + + major, _ = torch.cuda.get_device_capability() + if major < 8: + self.skipTest("INT4 tile_packed_to_4d format requires SM80+ (A100 or newer)") + + try: + from torchao.quantization import Int4WeightOnlyConfig, quantize_ + except ImportError: + self.skipTest("torchao not available") + + # Whisper decoder dimensions (from whisper-large-v3-turbo) + hidden_size = 1280 + num_heads = 20 + head_dim = hidden_size // num_heads # 64 + intermediate_size = hidden_size * 4 # 5120 + group_size = 128 + + class WhisperDecoderLayer(torch.nn.Module): + """Simplified Whisper decoder layer for testing INT4 fusion.""" + + def __init__(self): + super().__init__() + # Self-attention projections (Q/K/V should be fused) + self.self_attn_q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.self_attn_k_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.self_attn_v_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.self_attn_out_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + + # Cross-attention projections (Q/K/V should be fused) + self.cross_attn_q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.cross_attn_k_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.cross_attn_v_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.cross_attn_out_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + + # MLP (fc1/fc2) + self.fc1 = torch.nn.Linear(hidden_size, intermediate_size, bias=True) + self.fc2 = torch.nn.Linear(intermediate_size, hidden_size, bias=True) + + # Layer norms + self.self_attn_layer_norm = torch.nn.LayerNorm(hidden_size) + self.cross_attn_layer_norm = torch.nn.LayerNorm(hidden_size) + self.final_layer_norm = torch.nn.LayerNorm(hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + ) -> torch.Tensor: + batch_size, seq_len, _ = hidden_states.shape + encoder_seq_len = encoder_hidden_states.shape[1] + + # Self-attention + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Q/K/V projections (should be fused by FuseInt4WeightOnlyQuantMatmulPass) + q = self.self_attn_q_proj(hidden_states) + k = self.self_attn_k_proj(hidden_states) + v = self.self_attn_v_proj(hidden_states) + + # Reshape for multi-head attention + q = q.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2) + k = k.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2) + v = v.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2) + + # SDPA (should be replaced with triton.sdpa by ReplaceEdgeOpWithTritonOpPass) + attn_output = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True + ) + + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, 
seq_len, hidden_size) + hidden_states = self.self_attn_out_proj(attn_output) + hidden_states = residual + hidden_states + + # Cross-attention + residual = hidden_states + hidden_states = self.cross_attn_layer_norm(hidden_states) + + # Cross Q/K/V projections (should be fused) + q = self.cross_attn_q_proj(hidden_states) + k = self.cross_attn_k_proj(encoder_hidden_states) + v = self.cross_attn_v_proj(encoder_hidden_states) + + q = q.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2) + k = k.view(batch_size, encoder_seq_len, num_heads, head_dim).transpose(1, 2) + v = v.view(batch_size, encoder_seq_len, num_heads, head_dim).transpose(1, 2) + + # Cross-attention SDPA + attn_output = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False + ) + + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, hidden_size) + hidden_states = self.cross_attn_out_proj(attn_output) + hidden_states = residual + hidden_states + + # MLP + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.fc1(hidden_states) + hidden_states = torch.nn.functional.gelu(hidden_states) + hidden_states = self.fc2(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + # Create model with bfloat16 (required for SDPA with Triton) + module = WhisperDecoderLayer().to(dtype=torch.bfloat16, device="cuda") + module.eval() + + # Apply INT4 quantization with tile_packed_to_4d format + int4_config = Int4WeightOnlyConfig( + group_size=group_size, + int4_packing_format="tile_packed_to_4d", + ) + quantize_(module, int4_config) + + # Prepare inputs + batch_size = 1 + seq_len = 16 + encoder_seq_len = 1500 # Whisper encoder output length + + hidden_states = torch.randn( + batch_size, seq_len, hidden_size, + dtype=torch.bfloat16, device="cuda" + ) + encoder_hidden_states = torch.randn( + batch_size, encoder_seq_len, hidden_size, + dtype=torch.bfloat16, device="cuda" + ) + + inputs = (hidden_states, encoder_hidden_states) + + # Export and lower - this exercises the full pass chain + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + + self.assertIsNotNone( + edge_program_manager, + "Whisper decoder INT4 export with full pass chain failed" + ) + + def test_whisper_encoder_int4_contiguous_outputs(self): + """ + Regression test for non-contiguous tensor outputs in encoder pattern. + + BUG: When the INT4 fusion pass fuses Q/K/V projections, it uses tensor_split + to divide the fused output back into separate Q/K/V tensors. tensor_split + creates non-contiguous views with incorrect strides: + - Expected strides for [batch, seq, hidden]: [seq*hidden, hidden, 1] + - Actual strides after split: [seq*3*hidden, 3*hidden, 1] + + For encoder patterns with seq_len > 1, this causes: + - PyTorch's is_contiguous() to return False + - Kernels assuming contiguous layout to read wrong memory locations + + For decoder patterns with seq_len=1, the bug doesn't manifest because + dim 1 has size 1, making stride[1] irrelevant for contiguity checks. + + This test simulates a Whisper encoder layer processing a full audio sequence + (seq_len=1500) and verifies that Q/K/V outputs are contiguous after fusion + by checking the FakeTensor metadata (which is used during AOTI compilation). + + THIS TEST SHOULD FAIL until the fix is applied (adding .contiguous() after split). 
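+
+        A minimal, standalone illustration of the stride issue (shapes are
+        illustrative and not taken from the test itself):
+
+            fused = torch.randn(1, 1500, 3 * 1280)
+            q, k, v = torch.tensor_split(fused, [1280, 2560], dim=-1)
+            q.is_contiguous()               # False: stride is (1500*3840, 3840, 1)
+            q.contiguous().is_contiguous()  # True: materializes a dense copy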
+ """ + # Check for SM80+ (A100 or newer) required for INT4 tile_packed_to_4d format + if not torch.cuda.is_available(): + self.skipTest("CUDA not available") + + major, _ = torch.cuda.get_device_capability() + if major < 8: + self.skipTest("INT4 tile_packed_to_4d format requires SM80+ (A100 or newer)") + + try: + from torchao.quantization import Int4WeightOnlyConfig, quantize_ + except ImportError: + self.skipTest("torchao not available") + + from executorch.exir import EdgeCompileConfig, to_edge + from executorch.exir.program._program import _update_exported_program_graph_module + from torch.export import export + from torch.fx.passes.dialect.common.cse_pass import CSEPass + from torch.fx.passes.infra.pass_base import PassResult + + from executorch.backends.cuda.passes import FuseInt4WeightOnlyQuantMatmulPass + + # Whisper encoder dimensions + hidden_size = 1280 + group_size = 64 # Whisper encoder uses 64 for dimension 320 + seq_len = 1500 # Encoder processes full audio sequence + + class WhisperEncoderQKV(torch.nn.Module): + """ + Simplified Whisper encoder attention projections. + Returns Q/K/V separately to check their contiguity. + """ + + def __init__(self): + super().__init__() + self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.k_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.layer_norm = torch.nn.LayerNorm(hidden_size) + + def forward(self, hidden_states: torch.Tensor): + # Layer norm before attention (encoder pattern) + hidden_states = self.layer_norm(hidden_states) + + # Q/K/V projections (should be fused) + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + return q, k, v + + # Create model + module = WhisperEncoderQKV().to(dtype=torch.bfloat16, device="cuda") + module.eval() + + # Apply INT4 quantization + int4_config = Int4WeightOnlyConfig( + group_size=group_size, + int4_packing_format="tile_packed_to_4d", + ) + quantize_(module, int4_config) + + # Create encoder-like input (full audio sequence) + x = torch.randn(1, seq_len, hidden_size, dtype=torch.bfloat16, device="cuda") + + # Export to edge dialect + exported_program = export(module, (x,), strict=True) + edge_program = to_edge( + exported_program, + compile_config=EdgeCompileConfig(_check_ir_validity=False) + ) + + ep = edge_program.exported_program() + + # Apply CSE pass + cse_result = CSEPass()(ep.graph_module) + if isinstance(cse_result, PassResult) and cse_result.modified: + ep = _update_exported_program_graph_module( + ep, cse_result.graph_module, override_verifiers=[] + ) + + # Apply fusion pass + fusion_result = FuseInt4WeightOnlyQuantMatmulPass()(ep.graph_module) + if isinstance(fusion_result, PassResult) and fusion_result.modified: + ep = _update_exported_program_graph_module( + ep, fusion_result.graph_module, override_verifiers=[] + ) + + # Verify fusion occurred by counting int4mm ops + int4mm_count = sum( + 1 for node in ep.graph_module.graph.nodes + if node.op == "call_function" and "_weight_int4pack_mm" in str(node.target) + ) + self.assertEqual(int4mm_count, 1, "Expected Q/K/V fusion (3->1)") + + # Check FakeTensor metadata for contiguous nodes (after the fix is applied) + # The fusion pass now adds .contiguous() after each getitem to ensure + # proper memory layout for AOTI compilation. 
+ contiguous_metadata = [] + for node in ep.graph_module.graph.nodes: + if node.op == "call_function" and "contiguous" in str(node.target): + if "val" in node.meta: + val = node.meta["val"] + if isinstance(val, torch.Tensor): + contiguous_metadata.append({ + "name": node.name, + "shape": tuple(val.shape), + "stride": tuple(val.stride()), + "is_contiguous": val.is_contiguous(), + }) + + # After the fix, there should be contiguous nodes with proper metadata + self.assertGreater(len(contiguous_metadata), 0, "Expected contiguous nodes after fusion (fix applied)") + + for meta in contiguous_metadata: + # Check that FakeTensor metadata shows contiguous tensors + self.assertTrue( + meta["is_contiguous"], + f"Encoder FakeTensor for {meta['name']} (seq_len={seq_len}) should be contiguous.\n" + f"Shape: {meta['shape']}, Strides: {meta['stride']}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/cuda/tests/test_fuse_int4_quant_matmul.py b/backends/cuda/tests/test_fuse_int4_quant_matmul.py new file mode 100644 index 00000000000..ff8be0f0db5 --- /dev/null +++ b/backends/cuda/tests/test_fuse_int4_quant_matmul.py @@ -0,0 +1,1176 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Unit tests for FuseInt4WeightOnlyQuantMatmulPass. + +Tests the fusion pass that combines multiple int4pack_mm operations sharing +the same input into a single fused operation. +""" + +import unittest +from typing import Tuple + +import torch +from executorch.backends.cuda.passes import FuseInt4WeightOnlyQuantMatmulPass +from executorch.exir.program._program import _update_exported_program_graph_module +from torch.export import Dim, export +from torch.fx import GraphModule +from torch.fx.passes.dialect.common.cse_pass import CSEPass +from torch.fx.passes.infra.pass_base import PassResult + + +class TestFuseInt4QuantMatmul(unittest.TestCase): + """Test FuseInt4WeightOnlyQuantMatmulPass public interface.""" + + def setUp(self): + """Set up test environment.""" + if not torch.cuda.is_available(): + self.skipTest("CUDA is not available") + + # Check if torchao is available + try: + from torchao.quantization.quantize_.workflows.int4.int4_tile_packed_to_4d_tensor import ( + Int4TilePackedTo4dTensor, + ) + from torchao.quantization.utils import pack_tinygemm_scales_and_zeros + from torchao.utils import find_multiple + except ImportError: + self.skipTest("torchao is not available") + + def _create_int4_weight( + self, out_features: int, in_features: int, block_size: int = 128 + ): + """ + Create Int4TilePackedTo4dTensor for testing. + + This creates a proper quantized weight tensor that will use + torch.ops.aten._weight_int4pack_mm when used in nn.functional.linear. 
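+
+        Rough usage sketch (dimensions are illustrative):
+
+            w = self._create_int4_weight(out_features=2048, in_features=2048)
+            x = torch.randn(4, 2048, device="cuda", dtype=torch.bfloat16)
+            y = torch.nn.functional.linear(x, w)  # hits _weight_int4pack_mm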
+ """ + from torchao.quantization.quantize_.workflows.int4.int4_tile_packed_to_4d_tensor import ( + Int4TilePackedTo4dTensor, + ) + from torchao.quantization.utils import pack_tinygemm_scales_and_zeros + from torchao.utils import find_multiple + + device = "cuda" + inner_k_tiles = 8 + + # Pad dimensions to required multiples + in_features_padded = find_multiple(in_features, 1024) + out_features_padded = find_multiple(out_features, 8) + + # Create INT4 values in range [-8, 7] + int4_values = torch.randint( + -8, 8, (out_features, in_features), dtype=torch.int8, device=device + ) + + # Create scales + num_blocks = in_features // block_size + scales = ( + torch.randn(out_features, num_blocks, dtype=torch.bfloat16, device=device) + * 0.01 + ) + + # Pad int4 values + int4_padded = torch.nn.functional.pad( + int4_values, + ( + 0, + in_features_padded - in_features, + 0, + out_features_padded - out_features, + ), + value=0, + ) + + # Pad scales + num_blocks_padded = in_features_padded // block_size + scales_padded = torch.nn.functional.pad( + scales, + (0, num_blocks_padded - num_blocks, 0, out_features_padded - out_features), + value=1.0, + ) + + # Convert to unsigned [0, 15] + int4_shifted = (int4_padded + 8).to(torch.int32).clamp(0, 15) + + # Pack two INT4 values per uint8 + int_data_packed = (int4_shifted[:, ::2] << 4 | int4_shifted[:, 1::2]).to( + torch.uint8 + ) + + # Convert to tinygemm format + packed_weight = torch.ops.aten._convert_weight_to_int4pack( + int_data_packed.contiguous(), inner_k_tiles + ) + + # Create scale_and_zero + zero_points = torch.zeros_like(scales_padded, dtype=scales_padded.dtype) + scale_and_zero = pack_tinygemm_scales_and_zeros( + scales_padded.reshape(out_features_padded, -1), + zero_points.reshape(out_features_padded, -1), + scales.dtype, + ) + + return Int4TilePackedTo4dTensor( + qdata=packed_weight, + scale_and_zero=scale_and_zero, + block_size=[1, block_size], + shape=(out_features, in_features), + act_pre_scale=None, + ) + + def _apply_fusion_pipeline(self, exported_program, min_fusion_size=2, max_fusion_size=3): + """ + Apply CSE + Fusion passes matching cuda_backend.py preprocessing. + + This tests the integration of CSEPass and FuseInt4WeightOnlyQuantMatmulPass + using the same pattern as CudaBackend.preprocess(). + """ + # CSE pass (required to create fuseable patterns) + cse_pass = CSEPass() + cse_result = cse_pass(exported_program.graph_module) + if cse_result.modified: + exported_program = _update_exported_program_graph_module( + exported_program, cse_result.graph_module + ) + + # Fusion pass + fusion_pass = FuseInt4WeightOnlyQuantMatmulPass( + min_fusion_size=min_fusion_size, max_fusion_size=max_fusion_size + ) + fusion_result = fusion_pass(exported_program.graph_module) + if fusion_result.modified: + exported_program = _update_exported_program_graph_module( + exported_program, fusion_result.graph_module + ) + + return exported_program + + def _count_int4mm_ops(self, graph_module: GraphModule) -> int: + """Count number of _weight_int4pack_mm operations in graph. + + Handles both standard torch ops and EdgeOpOverload (edge dialect). 
+ """ + count = 0 + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + target = node.target + # Direct match for standard torch op + if target == torch.ops.aten._weight_int4pack_mm.default: + count += 1 + # Handle EdgeOpOverload (edge dialect wraps ops) + elif "_weight_int4pack_mm" in str(target): + count += 1 + return count + + def _build_qkv_model( + self, hidden_dim: int = 2048, block_size: int = 128 + ) -> torch.nn.Module: + """ + Build a model with Q/K/V projections pattern (3 int4mm ops sharing input). + + This simulates attention projection layers that should be fused 3→1. + """ + + class QKVModel(torch.nn.Module): + def __init__(self, create_weight_fn): + super().__init__() + # Create linear layers with Int4TilePackedTo4dTensor weights + self.q_proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=False, device="cuda") + self.k_proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=False, device="cuda") + self.v_proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=False, device="cuda") + + # Replace weights with INT4 quantized tensors + self.q_proj.weight = torch.nn.Parameter( + create_weight_fn(hidden_dim, hidden_dim, block_size), requires_grad=False + ) + self.k_proj.weight = torch.nn.Parameter( + create_weight_fn(hidden_dim, hidden_dim, block_size), requires_grad=False + ) + self.v_proj.weight = torch.nn.Parameter( + create_weight_fn(hidden_dim, hidden_dim, block_size), requires_grad=False + ) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # Three int4mm operations sharing the same input + q = torch.nn.functional.linear(x, self.q_proj.weight) + k = torch.nn.functional.linear(x, self.k_proj.weight) + v = torch.nn.functional.linear(x, self.v_proj.weight) + return q, k, v + + return QKVModel(self._create_int4_weight).eval() + + def _build_gate_up_model( + self, hidden_dim: int = 2048, intermediate_dim: int = 8192, block_size: int = 128 + ) -> torch.nn.Module: + """ + Build a model with Gate/Up projection pattern (2 int4mm ops sharing input). + + This simulates MLP layers that should be fused 2→1. + """ + + class GateUpModel(torch.nn.Module): + def __init__(self, create_weight_fn): + super().__init__() + # Create linear layers with Int4TilePackedTo4dTensor weights + self.gate_proj = torch.nn.Linear( + hidden_dim, intermediate_dim, bias=False, device="cuda" + ) + self.up_proj = torch.nn.Linear( + hidden_dim, intermediate_dim, bias=False, device="cuda" + ) + + # Replace weights with INT4 quantized tensors + self.gate_proj.weight = torch.nn.Parameter( + create_weight_fn(intermediate_dim, hidden_dim, block_size), + requires_grad=False, + ) + self.up_proj.weight = torch.nn.Parameter( + create_weight_fn(intermediate_dim, hidden_dim, block_size), + requires_grad=False, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Two int4mm operations sharing the same input + gate = torch.nn.functional.linear(x, self.gate_proj.weight) + up = torch.nn.functional.linear(x, self.up_proj.weight) + return gate * up + + return GateUpModel(self._create_int4_weight).eval() + + def _build_different_inputs_model( + self, hidden_dim: int = 2048, block_size: int = 128 + ) -> torch.nn.Module: + """ + Build a model with operations on different inputs (should NOT fuse). 
+ """ + + class DifferentInputsModel(torch.nn.Module): + def __init__(self, create_weight_fn): + super().__init__() + # Create linear layers with Int4TilePackedTo4dTensor weights + self.linear1 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False, device="cuda") + self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False, device="cuda") + + # Replace weights with INT4 quantized tensors + self.linear1.weight = torch.nn.Parameter( + create_weight_fn(hidden_dim, hidden_dim, block_size), requires_grad=False + ) + self.linear2.weight = torch.nn.Parameter( + create_weight_fn(hidden_dim, hidden_dim, block_size), requires_grad=False + ) + + def forward( + self, x: torch.Tensor, y: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Two operations with different inputs - should NOT fuse + out1 = torch.nn.functional.linear(x, self.linear1.weight) + out2 = torch.nn.functional.linear(y, self.linear2.weight) + return out1, out2 + + return DifferentInputsModel(self._create_int4_weight).eval() + + def _build_cross_attention_model( + self, decoder_dim: int = 1280, encoder_dim: int = 1280, block_size: int = 128 + ) -> torch.nn.Module: + """ + Build a model with cross-attention pattern (Whisper-like). + + Pattern: Q from decoder, K/V from encoder (K/V should fuse 2→1, Q separate). + This simulates Whisper decoder cross-attention where K/V share encoder + output but Q uses decoder hidden state. + """ + + class CrossAttentionModel(torch.nn.Module): + def __init__(self, create_weight_fn): + super().__init__() + # Q projection from decoder hidden state + self.q_proj = torch.nn.Linear(decoder_dim, decoder_dim, bias=False, device="cuda") + # K/V projections from encoder output + self.k_proj = torch.nn.Linear(encoder_dim, decoder_dim, bias=False, device="cuda") + self.v_proj = torch.nn.Linear(encoder_dim, decoder_dim, bias=False, device="cuda") + + # Replace weights with INT4 quantized tensors + self.q_proj.weight = torch.nn.Parameter( + create_weight_fn(decoder_dim, decoder_dim, block_size), requires_grad=False + ) + self.k_proj.weight = torch.nn.Parameter( + create_weight_fn(decoder_dim, encoder_dim, block_size), requires_grad=False + ) + self.v_proj.weight = torch.nn.Parameter( + create_weight_fn(decoder_dim, encoder_dim, block_size), requires_grad=False + ) + + def forward( + self, decoder_hidden: torch.Tensor, encoder_output: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # Cross-attention: Q from decoder, K/V from encoder + q = torch.nn.functional.linear(decoder_hidden, self.q_proj.weight) + k = torch.nn.functional.linear(encoder_output, self.k_proj.weight) + v = torch.nn.functional.linear(encoder_output, self.v_proj.weight) + return q, k, v + + return CrossAttentionModel(self._create_int4_weight).eval() + + def _build_sequential_model( + self, hidden_dim: int = 2048, intermediate_dim: int = 8192, block_size: int = 128 + ) -> torch.nn.Module: + """ + Build a model with sequential operations (should NOT fuse). + + This simulates Whisper MLP: fc1 → GELU → fc2 (sequential chain). 
+ """ + + class SequentialModel(torch.nn.Module): + def __init__(self, create_weight_fn): + super().__init__() + # Sequential MLP layers + self.fc1 = torch.nn.Linear( + hidden_dim, intermediate_dim, bias=False, device="cuda" + ) + self.fc2 = torch.nn.Linear( + intermediate_dim, hidden_dim, bias=False, device="cuda" + ) + + # Replace weights with INT4 quantized tensors + self.fc1.weight = torch.nn.Parameter( + create_weight_fn(intermediate_dim, hidden_dim, block_size), + requires_grad=False, + ) + self.fc2.weight = torch.nn.Parameter( + create_weight_fn(hidden_dim, intermediate_dim, block_size), + requires_grad=False, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Sequential operations: fc2 depends on fc1 output + x = torch.nn.functional.linear(x, self.fc1.weight) + x = torch.nn.functional.gelu(x) + x = torch.nn.functional.linear(x, self.fc2.weight) + return x + + return SequentialModel(self._create_int4_weight).eval() + + def test_fuse_qkv_projection(self): + """Test fusion of Q/K/V projections (3→1 operation).""" + model = self._build_qkv_model(hidden_dim=2048, block_size=128) + example_input = (torch.randn(4, 128, 2048, device="cuda", dtype=torch.bfloat16),) + + # Export model + exported = export(model, example_input, strict=False) + ops_before = self._count_int4mm_ops(exported.graph_module) + self.assertEqual(ops_before, 3, "Expected 3 int4mm operations before fusion") + + # Apply CSE + Fusion pipeline + result = self._apply_fusion_pipeline(exported) + + # Verify fusion: 3 → 1 + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual(ops_after, 1, "Expected 1 int4mm operation after fusion (3→1)") + result.graph_module.graph.lint() + + def test_fuse_gate_up_projection(self): + """Test fusion of Gate/Up projections (2→1 operation).""" + model = self._build_gate_up_model( + hidden_dim=2048, intermediate_dim=8192, block_size=128 + ) + example_input = (torch.randn(4, 128, 2048, device="cuda", dtype=torch.bfloat16),) + + # Export model + exported = export(model, example_input, strict=False) + ops_before = self._count_int4mm_ops(exported.graph_module) + self.assertEqual(ops_before, 2, "Expected 2 int4mm operations before fusion") + + # Apply CSE + Fusion pipeline + result = self._apply_fusion_pipeline(exported) + + # Verify fusion: 2 → 1 + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual(ops_after, 1, "Expected 1 int4mm operation after fusion (2→1)") + result.graph_module.graph.lint() + + def test_no_fusion_different_inputs(self): + """Test that operations with different inputs are NOT fused.""" + model = self._build_different_inputs_model(hidden_dim=2048, block_size=128) + example_inputs = ( + torch.randn(4, 128, 2048, device="cuda", dtype=torch.bfloat16), + torch.randn(4, 128, 2048, device="cuda", dtype=torch.bfloat16), + ) + + # Export model + exported = export(model, example_inputs, strict=False) + ops_before = self._count_int4mm_ops(exported.graph_module) + self.assertEqual(ops_before, 2, "Expected 2 int4mm operations before fusion") + + # Apply CSE + Fusion pipeline + result = self._apply_fusion_pipeline(exported) + + # Verify NO fusion (different inputs) + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual(ops_after, 2, "Expected 2 int4mm operations (unchanged)") + + def test_respects_min_fusion_size(self): + """Test that fusion respects min_fusion_size parameter.""" + model = self._build_qkv_model(hidden_dim=2048, block_size=128) + example_input = (torch.randn(4, 128, 2048, device="cuda", 
dtype=torch.bfloat16),) + + # Export model + exported = export(model, example_input, strict=False) + + # Apply pipeline with min_fusion_size=4 (should NOT fuse 3 ops) + result = self._apply_fusion_pipeline(exported, min_fusion_size=4, max_fusion_size=5) + + # Verify NO fusion (below min_fusion_size) + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual(ops_after, 3, "Expected 3 int4mm operations (unchanged)") + + def test_dynamic_shapes_with_symint(self): + """ + Test fusion with dynamic shapes (SymInt). + + Critical for models with dynamic sequence lengths (Voxtral, other LLMs). + """ + model = self._build_qkv_model(hidden_dim=3072, block_size=128) + + # Export with dynamic shapes + seq_length = 3 + inputs_embeds = torch.randn(1, seq_length, 3072, device="cuda", dtype=torch.bfloat16) + seq_len_dim = Dim("seq_length_dim", max=128) + dynamic_shapes = ({1: seq_len_dim},) + + exported = export(model, (inputs_embeds,), dynamic_shapes=dynamic_shapes, strict=False) + ops_before = self._count_int4mm_ops(exported.graph_module) + self.assertEqual(ops_before, 3, "Expected 3 int4mm operations before fusion") + + # Apply CSE + Fusion pipeline (should handle SymInt correctly with device='meta') + result = self._apply_fusion_pipeline(exported) + + # Verify fusion works with dynamic shapes + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual( + ops_after, 1, "Expected 1 int4mm operation after fusion with dynamic shapes" + ) + result.graph_module.graph.lint() + + def test_preserves_graph_validity(self): + """Test that fusion preserves graph validity and metadata.""" + model = self._build_qkv_model(hidden_dim=2048, block_size=128) + example_input = (torch.randn(4, 128, 2048, device="cuda", dtype=torch.bfloat16),) + + # Export and apply fusion + exported = export(model, example_input, strict=False) + result = self._apply_fusion_pipeline(exported) + + # Verify graph validity + result.graph_module.graph.lint() # Should not raise + code = result.graph_module.code + self.assertIsNotNone(code, "Fused graph should generate code") + + # Verify all function nodes have targets + for node in result.graph_module.graph.nodes: + if node.op == "call_function": + self.assertIsNotNone(node.target, f"Node {node.name} missing target") + + def test_fuse_cross_attention_kv(self): + """ + Test fusion of cross-attention K/V projections (Whisper pattern). + + Cross-attention pattern: Q from decoder, K/V from encoder. + Expected: K/V fuse (2→1), Q stays separate (total 3→2). + """ + model = self._build_cross_attention_model( + decoder_dim=1280, encoder_dim=1280, block_size=128 + ) + decoder_hidden = torch.randn(4, 128, 1280, device="cuda", dtype=torch.bfloat16) + encoder_output = torch.randn(4, 256, 1280, device="cuda", dtype=torch.bfloat16) + example_inputs = (decoder_hidden, encoder_output) + + # Export model + exported = export(model, example_inputs, strict=False) + ops_before = self._count_int4mm_ops(exported.graph_module) + self.assertEqual(ops_before, 3, "Expected 3 int4mm operations before fusion") + + # Apply CSE + Fusion pipeline + result = self._apply_fusion_pipeline(exported) + + # Verify partial fusion: K/V fuse (2→1), Q separate = 2 ops total + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual( + ops_after, 2, "Expected 2 int4mm operations after fusion (K/V fused, Q separate)" + ) + result.graph_module.graph.lint() + + def test_zero_copy_split_no_materialization(self): + """ + Test that fused QKV split does not introduce memory copies. 
+ + After fusion, the split operation should create views (zero-copy) rather + than materialized copies. This test verifies that: + 1. Split operations exist in the graph (tensor_split or slice) + 2. NO contiguous/clone/copy operations are inserted after split + 3. The graph structure supports zero-copy views + + This is critical for performance: materialization adds 29% overhead, + while zero-copy split provides 17% speedup (per roofline analysis). + """ + model = self._build_qkv_model(hidden_dim=1280, block_size=128) + example_input = (torch.randn(1, 128, 1280, device="cuda", dtype=torch.bfloat16),) + + # Export and apply fusion + exported = export(model, example_input, strict=False) + result = self._apply_fusion_pipeline(exported) + + # Verify fusion occurred (3→1) + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual(ops_after, 1, "Expected 1 fused int4mm operation") + + # Track split operations and potential materializations + has_tensor_split = False + has_slice_ops = False + materialization_ops = [] + split_output_nodes = set() + fused_int4mm_node = None + + # First pass: identify fused int4mm and split operations + for node in result.graph_module.graph.nodes: + if node.op == "call_function": + # Find the fused int4mm node + if node.target == torch.ops.aten._weight_int4pack_mm.default: + fused_int4mm_node = node + + # Check for split operations + if "tensor_split" in str(node.target): + has_tensor_split = True + split_output_nodes.add(node) + # All users of tensor_split are getitem nodes + for user in node.users: + split_output_nodes.add(user) + elif node.target == torch.ops.aten.slice.Tensor: + has_slice_ops = True + split_output_nodes.add(node) + + # The fused int4mm output is used by either tensor_split or slice operations + # Check the users of the fused int4mm node + if fused_int4mm_node: + for user in fused_int4mm_node.users: + target_str = str(user.target) if user.op == "call_function" else "" + if "tensor_split" in target_str or "split" in target_str: + has_tensor_split = True + split_output_nodes.add(user) + # Add getitem users + for getitem_user in user.users: + split_output_nodes.add(getitem_user) + elif user.target == torch.ops.aten.slice.Tensor: + has_slice_ops = True + split_output_nodes.add(user) + + # Verify we have split operations + self.assertTrue( + has_tensor_split or has_slice_ops, + "Expected tensor_split or slice operations in fused graph" + ) + + # Second pass: check for materialization operations AFTER split + # These would indicate forced memory copies + for node in result.graph_module.graph.nodes: + if node.op == "call_function": + # Check if this op acts on split outputs + is_downstream_of_split = any( + arg in split_output_nodes + for arg in node.args + if isinstance(arg, torch.fx.Node) + ) + + if is_downstream_of_split: + # Check for operations that force materialization + target_str = str(node.target) + if any( + op in target_str.lower() + for op in ["contiguous", "clone", "copy", "_copy"] + ): + materialization_ops.append((node.name, target_str)) + + # Assert no forced materializations + self.assertEqual( + len(materialization_ops), + 0, + f"Found materialization operations after split: {materialization_ops}\n" + f"This indicates the split is NOT zero-copy and will hurt performance.\n" + f"Expected: Views only (tensor_split or slice)\n" + f"Found: {materialization_ops}", + ) + + # Verify graph structure + result.graph_module.graph.lint() + + # Additional verification: count split-related ops + split_op_count = sum( + 1 + for 
node in result.graph_module.graph.nodes + if node.op == "call_function" + and ( + "tensor_split" in str(node.target) + or node.target == torch.ops.aten.slice.Tensor + ) + ) + + if has_tensor_split: + # tensor_split: 1 split node + 3 getitem nodes + getitem_count = sum( + 1 + for node in result.graph_module.graph.nodes + if node.op == "call_function" and "getitem" in str(node.target) + ) + # We should have exactly 1 tensor_split + self.assertGreaterEqual(split_op_count, 1, "Expected at least 1 tensor_split operation") + # And 3 getitem nodes (Q, K, V) + self.assertEqual(getitem_count, 3, "Expected 3 getitem operations for Q/K/V") + else: + # Explicit slicing: 3 slice operations (one each for Q, K, V) + self.assertEqual(split_op_count, 3, "Expected 3 slice operations for Q/K/V") + + def test_no_fusion_sequential_ops(self): + """ + Test that sequential operations do NOT fuse (Whisper MLP pattern). + + Sequential pattern: fc1 → GELU → fc2 (fc2 depends on fc1 output). + Expected: 2→2 (no fusion, different inputs). + """ + model = self._build_sequential_model( + hidden_dim=1280, intermediate_dim=5120, block_size=128 + ) + example_input = (torch.randn(4, 128, 1280, device="cuda", dtype=torch.bfloat16),) + + # Export model + exported = export(model, example_input, strict=False) + ops_before = self._count_int4mm_ops(exported.graph_module) + self.assertEqual(ops_before, 2, "Expected 2 int4mm operations before fusion") + + # Apply CSE + Fusion pipeline + result = self._apply_fusion_pipeline(exported) + + # Verify NO fusion (sequential operations have different inputs) + ops_after = self._count_int4mm_ops(result.graph_module) + self.assertEqual( + ops_after, 2, "Expected 2 int4mm operations (no fusion for sequential ops)" + ) + result.graph_module.graph.lint() + + def test_aoti_backend_pass_application(self): + """ + Regression test for AotiBackend.preprocess pass application. + + Tests that CudaBackend.get_custom_passes() returns passes that work + correctly with the AotiBackend.preprocess pass application loop, + specifically testing that PassResult is properly handled and + _update_exported_program_graph_module is called when passes modify + the graph. + + This test exercises the exact same code path as AotiBackend.preprocess() + lines 158-166, ensuring that: + 1. get_custom_passes() returns CSEPass and FuseInt4WeightOnlyQuantMatmulPass + 2. Passes returning PassResult with modified=True trigger graph update + 3. The exported program is properly updated after each pass + 4. 
Fusion actually occurs (int4mm ops are reduced) + """ + from executorch.backends.cuda.cuda_backend import CudaBackend + from executorch.exir.backend.compile_spec_schema import CompileSpec + + # Build model with fuseable Q/K/V pattern + model = self._build_qkv_model(hidden_dim=2048, block_size=128) + example_input = (torch.randn(4, 128, 2048, device="cuda", dtype=torch.bfloat16),) + + # Export model + exported = export(model, example_input, strict=False) + ops_before = self._count_int4mm_ops(exported.graph_module) + self.assertEqual(ops_before, 3, "Expected 3 int4mm operations before fusion") + + # Get custom passes from CudaBackend (same as preprocess does) + compile_specs = [ + CompileSpec("triton_kernel_mode", b"OFF"), # Disable Triton to simplify test + ] + custom_passes = CudaBackend.get_custom_passes(compile_specs) + + # Verify we get CSE and Fusion passes + pass_types = [type(p).__name__ for p in custom_passes] + self.assertIn("CSEPass", pass_types, "Expected CSEPass in custom passes") + self.assertIn( + "FuseInt4WeightOnlyQuantMatmulPass", + pass_types, + "Expected FuseInt4WeightOnlyQuantMatmulPass in custom passes", + ) + + # Apply passes using the EXACT same logic as AotiBackend.preprocess() + # This is the critical code path we're testing (lines 158-168 of aoti_backend.py) + device_edge_program = exported + for custom_pass in custom_passes: + result = custom_pass(device_edge_program.graph_module) + # Handle passes that return PassResult with a new graph_module + if isinstance(result, PassResult) and result.modified: + # Use a permissive verifier that allows all operator types + from torch._export.verifier import Verifier + from torch._library.custom_ops import CustomOpDef + from torch._ops import HigherOrderOperator, OpOverload, OpOverloadPacket + + class _PermissiveVerifier(Verifier): + dialect = "EDGE" + + def allowed_op_types(self): + return (OpOverload, OpOverloadPacket, HigherOrderOperator, CustomOpDef) + + def check_valid_op(self, op): + pass # Allow all ops + + device_edge_program = _update_exported_program_graph_module( + device_edge_program, result.graph_module, override_verifiers=[_PermissiveVerifier] + ) + + # Verify fusion occurred (3→1) + ops_after = self._count_int4mm_ops(device_edge_program.graph_module) + self.assertEqual( + ops_after, + 1, + f"Expected 1 int4mm operation after fusion via AotiBackend pass application, " + f"got {ops_after}. This indicates PassResult handling is broken.", + ) + + # Verify graph is still valid + device_edge_program.graph_module.graph.lint() + + def test_aoti_backend_pass_application_with_triton(self): + """ + Regression test for AotiBackend.preprocess pass application with Triton. + + This test verifies that when custom passes (like ReplaceEdgeOpWithTritonOpPass) + introduce operators that the default verifier doesn't recognize (like triton.sdpa), + the pass application loop doesn't fail validation. + + The fix was to pass override_verifiers=[] to _update_exported_program_graph_module + to skip validation during pass application. + + This test directly verifies that: + 1. CudaBackend.get_custom_passes() includes ReplaceEdgeOpWithTritonOpPass + 2. The pass application loop with override_verifiers=[] works correctly + 3. 
CSE pass returns PassResult with modified=True and new graph_module + """ + from executorch.backends.cuda.cuda_backend import CudaBackend + from executorch.exir.backend.compile_spec_schema import CompileSpec + + # Use a simple model (no SDPA) to test the pass application logic + # The key thing we're testing is that override_verifiers=[] works + model = self._build_qkv_model(hidden_dim=2048, block_size=128) + example_input = (torch.randn(4, 128, 2048, device="cuda", dtype=torch.bfloat16),) + exported = export(model, example_input, strict=False) + + # Get custom passes from CudaBackend WITH Triton enabled (default) + compile_specs = [] # Default: triton_kernel_mode="ON" + custom_passes = CudaBackend.get_custom_passes(compile_specs) + + # Verify we get all three passes including Triton + pass_types = [type(p).__name__ for p in custom_passes] + self.assertIn("CSEPass", pass_types) + self.assertIn("FuseInt4WeightOnlyQuantMatmulPass", pass_types) + self.assertIn("ReplaceEdgeOpWithTritonOpPass", pass_types) + + # Track that at least one pass returns PassResult with modified=True + # This is the scenario that triggered the original bug + modified_count = 0 + device_edge_program = exported + + for custom_pass in custom_passes: + result = custom_pass(device_edge_program.graph_module) + if isinstance(result, PassResult) and result.modified: + modified_count += 1 + # This is the critical fix: override_verifiers=[] prevents + # SpecViolationError when graph contains unknown operators + device_edge_program = _update_exported_program_graph_module( + device_edge_program, result.graph_module, override_verifiers=[] + ) + + # Verify at least one pass modified the graph + self.assertGreater( + modified_count, + 0, + "Expected at least one pass to return PassResult with modified=True", + ) + + # Verify graph is still valid (lint check) + device_edge_program.graph_module.graph.lint() + + # Verify fusion occurred (the passes actually did something) + ops_after = self._count_int4mm_ops(device_edge_program.graph_module) + self.assertEqual(ops_after, 1, "Expected fusion to occur (3→1)") + + def test_fusion_numerical_correctness_with_edge_dialect(self): + """ + Regression test for topological ordering bug in fusion pass. + + This test verifies that fusion produces numerically correct results when + applied to a real edge-exported graph (not a manually constructed mock graph). + + The bug was that the fusion pass was inserting fused nodes (cat, fused_mm) + at the wrong position in the graph - after placeholder nodes instead of + after the input preprocessing nodes. This caused the fused_mm to reference + inputs that hadn't been computed yet, breaking the computation. + + The fix was to use `inserting_before(first_int4mm)` instead of + `inserting_after(max(weights + scales))` to maintain correct topological order. + + This test catches the bug by: + 1. Using a real edge-exported graph with proper preprocessing (constant_pad_nd, etc.) + 2. Properly updating the ExportedProgram after each pass (like aoti_backend.py does) + 3. 
Comparing eager output vs fused output numerically + """ + # Skip if torchao not available + try: + from torchao.quantization import Int4WeightOnlyConfig, quantize_ + except ImportError: + self.skipTest("torchao not available") + + # Skip if no CUDA or SM80+ + if not torch.cuda.is_available(): + self.skipTest("CUDA not available") + major, _ = torch.cuda.get_device_capability() + if major < 8: + self.skipTest("Requires SM80+ (A100 or newer)") + + from executorch.exir import EdgeCompileConfig, to_edge + + hidden_size = 256 + group_size = 128 + + class SimpleQKVModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + + def forward(self, x): + return self.q_proj(x) + self.k_proj(x) + self.v_proj(x) + + # Create and quantize model + torch.manual_seed(42) + module = SimpleQKVModule().to(dtype=torch.bfloat16, device="cuda").eval() + + int4_config = Int4WeightOnlyConfig( + group_size=group_size, + int4_packing_format="tile_packed_to_4d", + ) + quantize_(module, int4_config) + + # Create input + x = torch.randn(1, 16, hidden_size, dtype=torch.bfloat16, device="cuda") + + # Get eager output (ground truth) + with torch.no_grad(): + eager_output = module(x) + + # Export to edge dialect (this creates the real graph structure with + # preprocessing nodes like constant_pad_nd that exposed the bug) + exported_program = export(module, (x,), strict=True) + edge_program = to_edge( + exported_program, + compile_config=EdgeCompileConfig(_check_ir_validity=False) + ) + + # Get the exported program and apply passes using _update_exported_program_graph_module + # This mirrors what aoti_backend.py does - passes return new graph_modules that must + # be properly integrated back into the ExportedProgram + ep = edge_program.exported_program() + + # Apply CSE pass and update the ExportedProgram + cse_result = CSEPass()(ep.graph_module) + if isinstance(cse_result, PassResult) and cse_result.modified: + ep = _update_exported_program_graph_module( + ep, cse_result.graph_module, override_verifiers=[] + ) + + # Count ops before fusion + ops_before = self._count_int4mm_ops(ep.graph_module) + self.assertEqual(ops_before, 3, "Expected 3 int4mm ops before fusion") + + # Apply fusion pass and update the ExportedProgram + fusion_result = FuseInt4WeightOnlyQuantMatmulPass()(ep.graph_module) + self.assertTrue( + isinstance(fusion_result, PassResult) and fusion_result.modified, + "Fusion pass should modify the graph" + ) + ep = _update_exported_program_graph_module( + ep, fusion_result.graph_module, override_verifiers=[] + ) + + # Count ops after fusion + ops_after = self._count_int4mm_ops(ep.graph_module) + self.assertEqual(ops_after, 1, "Expected 1 int4mm op after fusion") + + # Verify graph is topologically valid (this would have caught the bug) + ep.graph_module.graph.lint() + + # Run fused graph and compare output + # Now ep.module() returns the FUSED model because we properly updated ep + with torch.no_grad(): + fused_output = ep.module()(x) + + # Verify numerical correctness + diff = (eager_output - fused_output).abs() + max_diff = diff.max().item() + + self.assertLess( + max_diff, + 1e-3, + f"Fused output differs from eager output by {max_diff}. " + f"This indicates incorrect graph rewiring during fusion." 
+ ) + + def test_fusion_split_outputs_contiguous_encoder_pattern(self): + """ + Regression test for non-contiguous tensor_split outputs in encoder pattern. + + BUG: The fusion pass uses tensor_split to divide the fused output back into + Q/K/V tensors. tensor_split creates non-contiguous views with incorrect strides: + - Expected strides for [batch, seq, hidden]: [seq*hidden, hidden, 1] + - Actual strides after split: [seq*3*hidden, 3*hidden, 1] + + This causes issues for encoder patterns with seq_len > 1 because: + - Kernels assuming contiguous layout will read wrong memory locations + - The stride[1] mismatch (3*hidden vs hidden) causes incorrect indexing + + The decoder (seq_len=1) is unaffected because dim 1 has size 1, making + the stride irrelevant. This explains why encoder fails but decoder works. + + NOTE: In eager execution, the outputs may appear contiguous because + subsequent view/reshape operations create contiguous copies. However, + the FakeTensor metadata (used during AOTI compilation) correctly shows + non-contiguous strides. This test checks the FakeTensor metadata to catch + the bug that manifests during AOTI compilation. + + This test SHOULD FAIL until the fix is applied (adding .contiguous() after split). + + See: https://github.com/pytorch/executorch/issues/XXXXX + """ + # Skip if torchao not available + try: + from torchao.quantization import Int4WeightOnlyConfig, quantize_ + except ImportError: + self.skipTest("torchao not available") + + # Skip if no CUDA or SM80+ + if not torch.cuda.is_available(): + self.skipTest("CUDA not available") + major, _ = torch.cuda.get_device_capability() + if major < 8: + self.skipTest("Requires SM80+ (A100 or newer)") + + from executorch.exir import EdgeCompileConfig, to_edge + + hidden_size = 256 + group_size = 128 + # Use encoder-like seq_len (> 1) to trigger the bug + seq_len = 64 # Encoder processes full sequence + + class QKVModule(torch.nn.Module): + """Model that returns Q/K/V separately to check their contiguity.""" + def __init__(self): + super().__init__() + self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + + def forward(self, x): + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + return q, k, v + + # Create and quantize model + torch.manual_seed(42) + module = QKVModule().to(dtype=torch.bfloat16, device="cuda").eval() + + int4_config = Int4WeightOnlyConfig( + group_size=group_size, + int4_packing_format="tile_packed_to_4d", + ) + quantize_(module, int4_config) + + # Create encoder-like input (seq_len > 1) + x = torch.randn(1, seq_len, hidden_size, dtype=torch.bfloat16, device="cuda") + + # Export to edge dialect + exported_program = export(module, (x,), strict=True) + edge_program = to_edge( + exported_program, + compile_config=EdgeCompileConfig(_check_ir_validity=False) + ) + + ep = edge_program.exported_program() + + # Apply CSE pass + cse_result = CSEPass()(ep.graph_module) + if isinstance(cse_result, PassResult) and cse_result.modified: + ep = _update_exported_program_graph_module( + ep, cse_result.graph_module, override_verifiers=[] + ) + + # Verify 3 int4mm ops before fusion + ops_before = self._count_int4mm_ops(ep.graph_module) + self.assertEqual(ops_before, 3, "Expected 3 int4mm ops before fusion") + + # Apply fusion pass + fusion_result = FuseInt4WeightOnlyQuantMatmulPass()(ep.graph_module) + self.assertTrue( + isinstance(fusion_result, 
PassResult) and fusion_result.modified, + "Fusion pass should modify the graph" + ) + ep = _update_exported_program_graph_module( + ep, fusion_result.graph_module, override_verifiers=[] + ) + + # Verify fusion occurred (3 -> 1) + ops_after = self._count_int4mm_ops(ep.graph_module) + self.assertEqual(ops_after, 1, "Expected 1 int4mm op after fusion") + + # Check FakeTensor metadata for the replacement nodes (contiguous nodes after getitem) + # After the fix, the fusion pass adds .contiguous() after each getitem, + # so we should check the contiguous nodes for proper metadata. + contiguous_metadata = [] + for node in ep.graph_module.graph.nodes: + if node.op == "call_function" and "contiguous" in str(node.target): + if "val" in node.meta: + val = node.meta["val"] + if isinstance(val, torch.Tensor): + contiguous_metadata.append({ + "name": node.name, + "shape": tuple(val.shape), + "stride": tuple(val.stride()), + "is_contiguous": val.is_contiguous(), + }) + + # After the fix, there should be contiguous nodes with proper metadata + self.assertGreater(len(contiguous_metadata), 0, "Expected contiguous nodes after fusion (fix applied)") + + for meta in contiguous_metadata: + # Check that FakeTensor metadata shows contiguous tensors + self.assertTrue( + meta["is_contiguous"], + f"FakeTensor for {meta['name']} should be contiguous.\n" + f"Shape: {meta['shape']}, Strides: {meta['stride']}" + ) + + def test_fusion_split_outputs_decoder_pattern_contiguous(self): + """ + Verify decoder pattern (seq_len=1) produces "contiguous" tensors in FakeTensor metadata. + + This test documents why the decoder works while encoder fails: + - For seq_len=1, PyTorch considers the tensor contiguous even with + stride[1] = 3*hidden because dim 1 has size 1 (stride is irrelevant). + - For seq_len > 1 (encoder), the incorrect stride causes is_contiguous=False. + + This test SHOULD PASS (demonstrating the asymmetry between encoder/decoder). 
+ """ + # Skip if torchao not available + try: + from torchao.quantization import Int4WeightOnlyConfig, quantize_ + except ImportError: + self.skipTest("torchao not available") + + # Skip if no CUDA or SM80+ + if not torch.cuda.is_available(): + self.skipTest("CUDA not available") + major, _ = torch.cuda.get_device_capability() + if major < 8: + self.skipTest("Requires SM80+ (A100 or newer)") + + from executorch.exir import EdgeCompileConfig, to_edge + + hidden_size = 256 + group_size = 128 + # Use decoder-like seq_len (= 1) + seq_len = 1 + + class QKVModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False) + + def forward(self, x): + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + return q, k, v + + # Create and quantize model + torch.manual_seed(42) + module = QKVModule().to(dtype=torch.bfloat16, device="cuda").eval() + + int4_config = Int4WeightOnlyConfig( + group_size=group_size, + int4_packing_format="tile_packed_to_4d", + ) + quantize_(module, int4_config) + + # Create decoder-like input (seq_len = 1) + x = torch.randn(1, seq_len, hidden_size, dtype=torch.bfloat16, device="cuda") + + # Export to edge dialect + exported_program = export(module, (x,), strict=True) + edge_program = to_edge( + exported_program, + compile_config=EdgeCompileConfig(_check_ir_validity=False) + ) + + ep = edge_program.exported_program() + + # Apply CSE + Fusion passes + cse_result = CSEPass()(ep.graph_module) + if isinstance(cse_result, PassResult) and cse_result.modified: + ep = _update_exported_program_graph_module( + ep, cse_result.graph_module, override_verifiers=[] + ) + + fusion_result = FuseInt4WeightOnlyQuantMatmulPass()(ep.graph_module) + if isinstance(fusion_result, PassResult) and fusion_result.modified: + ep = _update_exported_program_graph_module( + ep, fusion_result.graph_module, override_verifiers=[] + ) + + # Verify fusion occurred + ops_after = self._count_int4mm_ops(ep.graph_module) + self.assertEqual(ops_after, 1, "Expected 1 int4mm op after fusion") + + # Check FakeTensor metadata for contiguous nodes (after the fix is applied) + contiguous_metadata = [] + for node in ep.graph_module.graph.nodes: + if node.op == "call_function" and "contiguous" in str(node.target): + if "val" in node.meta: + val = node.meta["val"] + if isinstance(val, torch.Tensor): + contiguous_metadata.append({ + "name": node.name, + "shape": tuple(val.shape), + "stride": tuple(val.stride()), + "is_contiguous": val.is_contiguous(), + }) + + # After the fix, there should be contiguous nodes + self.assertGreater(len(contiguous_metadata), 0, "Expected contiguous nodes after fusion") + + # For seq_len=1, FakeTensor should show contiguous=True + for meta in contiguous_metadata: + self.assertTrue( + meta["is_contiguous"], + f"Decoder FakeTensor for {meta['name']} (seq_len=1) should be contiguous.\n" + f"Shape: {meta['shape']}, Strides: {meta['stride']}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 4f2523989c1..b0f7f139307 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -196,6 +198,18 @@ Result> AsrRunner::transcribe( } } + // Tell CUDA backend to store encoder output as 
"encoder_output" + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("store_output", "encoder_output"); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG( + Debug, + "Failed to set store_output option (backend may not support storage)"); + } + } + auto encoder_result = module_->execute(kEncoderMethodName, preprocessed_features); ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error()); @@ -249,6 +263,20 @@ Result> AsrRunner::transcribe( decoder_inputs.emplace_back(decoder_input_ptr); decoder_inputs.emplace_back(encoder_output_ptr); decoder_inputs.emplace_back(cache_position_ptr); + + // Tell CUDA backend to use stored encoder output for matching decoder inputs. + // The backend matches by tensor size, avoiding redundant CPU->GPU copies. + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("use_stored_input", "encoder_output"); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG( + Debug, + "Failed to set use_stored_input option (backend may not support storage)"); + } + } + // Add some green coloring for the first generated token // token_callback("\033[1;32m"); while (generated_tokens < config.max_new_tokens) { @@ -304,6 +332,20 @@ Result> AsrRunner::transcribe( break; } } + + // Reset stored input settings and free GPU memory after decoder loop + // completes. This disables the D2D copy optimization and releases the stored + // encoder output. + { + ::executorch::runtime::BackendOptions<2> opts; + opts.set_option("reset_stored_input", true); + opts.set_option("clear_stored_tensor", "encoder_output"); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG(Error, "Failed to reset stored input settings"); + } + } + // Reset coloring // token_callback("\033[0m"); // Update stats and print report