Add custom mask to gather/scatter

gunnersdeng · gunnersdeng · commit 7bad87895b5b · 2026-02-02T16:55:37.000-08:00
Signed-off-by: Ziheng Deng <zihengd@nvidia.com>
diff --git a/changelog.d/gather-scatter-mask.md b/changelog.d/gather-scatter-mask.md
@@ -0,0 +1,4 @@
+<!--- SPDX-FileCopyrightText: Copyright (c) <2026> NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!--- SPDX-License-Identifier: Apache-2.0 -->
+
+- Added optional `mask` parameter to `ct.gather()` and `ct.scatter()` for custom boolean masking.
diff --git a/src/cuda/tile/_ir/ops.py b/src/cuda/tile/_ir/ops.py
@@ -2340,9 +2340,9 @@ def pointer_offset(pointer: Var, offset: Var) -> Var:
 
 
 @impl(ct.gather)
-def gather_impl(array: Var, indices: Var, padding_value: Var, check_bounds: Var,
-                latency: Var) -> Var:
-    pointer, mask = _gather_scatter_pointer_and_mask(array, indices, check_bounds)
+def gather_impl(array: Var, indices: Var, mask: Var, padding_value: Var,
+                check_bounds: Var, latency: Var) -> Var:
+    pointer, final_mask = _gather_scatter_pointer_and_mask(array, indices, check_bounds, mask)
     pointer_ty = pointer.get_type()
     pointer_shape = pointer_ty.shape_value if isinstance(pointer_ty, TileTy) else ()
 
@@ -2360,12 +2360,13 @@ def gather_impl(array: Var, indices: Var, padding_value: Var, check_bounds: Var,
     # Handle the latency hint
     latency = require_optional_constant_int(latency)
     check_load_store_hints(latency)
-    return load_pointer(pointer, mask, padding_value, latency)
+    return load_pointer(pointer, final_mask, padding_value, latency)
 
 
 @impl(ct.scatter)
-def scatter_impl(array: Var, indices: Var, value: Var, check_bounds: Var, latency: Var):
-    pointer, mask = _gather_scatter_pointer_and_mask(array, indices, check_bounds)
+def scatter_impl(array: Var, indices: Var, value: Var, mask: Var,
+                 check_bounds: Var, latency: Var):
+    pointer, final_mask = _gather_scatter_pointer_and_mask(array, indices, check_bounds, mask)
     pointer_ty = pointer.get_type()
     pointer_shape = pointer_ty.shape_value if isinstance(pointer_ty, TileTy) else ()
 
@@ -2377,7 +2378,7 @@ def scatter_impl(array: Var, indices: Var, value: Var, check_bounds: Var, latenc
     latency = require_optional_constant_int(latency)
     check_load_store_hints(latency)
 
-    store_pointer(pointer, value, mask, latency)
+    store_pointer(pointer, value, final_mask, latency)
 
 
 def _get_scatter_value(value: Var, pointer_shape: Tuple[int, ...], array_dtype: DType,
@@ -2395,9 +2396,52 @@ def _get_scatter_value(value: Var, pointer_shape: Tuple[int, ...], array_dtype:
     return broadcast_to(value, pointer_shape)
 
 
-def _gather_scatter_pointer_and_mask(array: Var,
-                                     indices: Var,
-                                     check_bounds: Var) -> Tuple[Var, Optional[Var]]:
+def _process_custom_mask(mask: Optional[Var], bounds_mask: Optional[Var],
+                         pointer_shape: Tuple[int, ...]) -> Optional[Var]:
+    """
+    Validate the user-supplied gather/scatter mask and merge it with the bounds mask.
+
+    Args:
+        mask: Caller-supplied mask; Python None or a Var holding constant None means "absent".
+        bounds_mask: Mask produced by bounds checking, or None when there is none.
+        pointer_shape: Shape the custom mask is broadcast to before it is used.
+
+    Returns:
+        `bounds_mask` unchanged when no custom mask is given; the broadcast custom mask
+        when there is no bounds mask; otherwise the bitwise AND of the two.
+
+    Raises:
+        TileTypeError: If the custom mask is not boolean, or its shape cannot be
+            broadcast to `pointer_shape`.
+    """
+    mask_absent = mask is None or (mask.is_constant() and mask.get_constant() is None)
+    if mask_absent:
+        return bounds_mask
+
+    # Type validation: the mask's dtype must be boolean.
+    custom_ty = require_tile_or_scalar_type(mask)
+    custom_dtype = get_dtype(custom_ty)
+    if not is_boolean(custom_dtype):
+        raise TileTypeError(f"Custom mask must have boolean dtype, but got {custom_dtype}")
+
+    # Shape validation: a non-tile mask is treated as having shape ().
+    custom_shape = custom_ty.shape_value if isinstance(custom_ty, TileTy) else ()
+    if not is_shape_broadcastable_to(custom_shape, pointer_shape):
+        raise TileTypeError(f"Custom mask shape {custom_shape} is not broadcastable"
+                            f" to the index shape {pointer_shape}")
+
+    expanded = broadcast_to(mask, pointer_shape)
+    if bounds_mask is not None:
+        # Both masks exist: combine them so both conditions must hold.
+        return binary_bitwise("and_", bounds_mask, expanded)
+    return expanded
+
+
+def _gather_scatter_pointer_and_mask(
+        array: Var,
+        indices: Var,
+        check_bounds: Var,
+        custom_mask: Optional[Var] = None) -> Tuple[Var, Optional[Var]]:
     check_bounds = require_constant_bool(check_bounds)
     array_ty = require_array_type(array)
     indices_ty = require_index_or_index_tuple_type(indices,
@@ -2475,10 +2519,15 @@ def _gather_scatter_pointer_and_mask(array: Var,
     # Offset the base pointer
     if offset is None:
         # 0-D array case
-        return array_val.base_ptr, None
+        pointer = array_val.base_ptr
+        pointer_shape = ()
     else:
         pointer = pointer_offset(array_val.base_ptr, offset)
-        return pointer, mask
+        pointer_shape = common_shape
+
+    # Process custom mask and combine with bounds mask
+    final_mask = _process_custom_mask(custom_mask, mask, pointer_shape)
+    return pointer, final_mask
 
 
 @memory_effect(MemoryEffect.STORE)
diff --git a/src/cuda/tile/_stub.py b/src/cuda/tile/_stub.py
@@ -625,7 +625,8 @@ def store(array: Array, /,
 
 
 @function
-def gather(array, indices, /, *, padding_value=0, check_bounds=True, latency=None) -> Tile:
+def gather(array, indices, /, *, mask=None, padding_value=0, check_bounds=True,
+           latency=None) -> Tile:
     """
     Loads a tile from the `array` elements specified by `indices`.
 
@@ -651,10 +652,19 @@ def gather(array, indices, /, *, padding_value=0, check_bounds=True, latency=Non
 
         >>> ct.gather(array, ind0)   # equivalent to ct.gather(array, (ind0,))
 
+    A custom boolean `mask` can be provided to control which elements are loaded.
+    The mask must be a scalar or a tile whose shape is broadcastable to the common shape
+    of indices. Where the mask is ``False``, `padding_value` is returned instead of loading
+    from the array.
+
     `gather()` checks that indices are within the bounds of the array. For indices
     that are out of bounds, `padding_value` will be returned (zero by default).
     It must be a scalar or a tile whose shape is broadcastable to the common shape of indices.
 
+    If both `mask` and `check_bounds=True` are provided, the effective mask is the logical
+    AND of both the custom mask and the bounds-checking mask. This means an element is only
+    loaded if both the custom mask is ``True`` AND the index is within bounds.
+
     To disable bounds checking, set `check_bounds` to ``False``.
     In this mode, the caller is responsible for ensuring that all indices are within the bounds
     of the array, and any out-of-bounds access will result in undefined behavior.
@@ -665,7 +675,7 @@ def gather(array, indices, /, *, padding_value=0, check_bounds=True, latency=Non
 
 
 @function
-def scatter(array, indices, value, /, *, check_bounds=True, latency=None):
+def scatter(array, indices, value, /, *, mask=None, check_bounds=True, latency=None):
     """
     Stores a tile `value` into the `array` elements specified by `indices`.
 
@@ -692,11 +702,20 @@ def scatter(array, indices, value, /, *, check_bounds=True, latency=None):
 
         >>> ct.scatter(array, ind0, value)   # equivalent to ct.scatter(array, (ind0,), value)
 
+    A custom boolean `mask` can be provided to control which elements are stored.
+    The mask must be a scalar or a tile whose shape is broadcastable to the common shape
+    of indices. Where the mask is ``False``, no store occurs.
+
     `scatter()` checks that indices are within the bounds of the array. For indices
-    that are out of bounds, nothing is stored. To disable bounds checking,
-    set `check_bounds` to ``False``. In this mode, the caller is responsible for ensuring that
-    all indices are within the bounds of the array, and any out-of-bounds access
-    will result in undefined behavior.
+    that are out of bounds, nothing is stored.
+
+    If both `mask` and `check_bounds=True` are provided, the effective mask is the logical
+    AND of both the custom mask and the bounds-checking mask. This means an element is only
+    stored if both the custom mask is ``True`` AND the index is within bounds.
+
+    To disable bounds checking, set `check_bounds` to ``False``. In this mode, the caller
+    is responsible for ensuring that all indices are within the bounds of the array, and
+    any out-of-bounds access will result in undefined behavior.
     """
 
 
diff --git a/test/test_gather_scatter.py b/test/test_gather_scatter.py
@@ -211,3 +211,198 @@ def test_ir_checked_vs_unchecked(kernel, expected_mask):
     store_ops = [op for op in root_block.traverse() if isinstance(op, StorePointerTokenOrdered)]
     assert len(store_ops) == 1
     assert (store_ops[0].mask is not None) == expected_mask
+
+
+# ============================================================================
+# Tests for custom mask parameter
+# ============================================================================
+
+@ct.kernel
+def gather_with_custom_mask_1d(x, y, mask_array):
+    """Kernel: gather 8 elements of `x` under a mask read from `mask_array`, store to `y`."""
+    lane = ct.arange(8, dtype=ct.int32)
+    # The per-element mask lives in an array, so read it into a tile first.
+    keep = ct.gather(mask_array, lane)
+    # Bounds checks are off: only the custom mask decides what gets loaded.
+    loaded = ct.gather(x, lane, mask=keep, padding_value=-999.0, check_bounds=False)
+    ct.scatter(y, lane, loaded)
+
+
+def test_gather_with_custom_mask_1d():
+    """A custom mask on gather loads the selected elements and pads the rest."""
+    # Keep the even positions, drop the odd ones.
+    keep = torch.tensor([pos % 2 == 0 for pos in range(8)],
+                        dtype=torch.bool, device="cuda")
+    src = torch.arange(8, dtype=torch.float32, device="cuda")
+    dst = torch.zeros(8, dtype=torch.float32, device="cuda")
+
+    stream = torch.cuda.current_stream()
+    ct.launch(stream, (1,), gather_with_custom_mask_1d, (src, dst, keep))
+
+    # Kept positions hold their source value; dropped ones hold the padding value -999.0.
+    want = [0.0, -999.0, 2.0, -999.0, 4.0, -999.0, 6.0, -999.0]
+    assert_equal(torch.tensor(want, device="cuda"), dst)
+
+
+@ct.kernel
+def gather_with_mask_and_bounds_check(x, y, indices_array, mask_array):
+    """Kernel: gather from `x` at indices read from an array, with mask and bounds checks."""
+    lane = ct.arange(8, dtype=ct.int32)
+    src_idx = ct.gather(indices_array, lane)
+    keep = ct.gather(mask_array, lane)
+    # A position is loaded only if its mask bit is set and its index is in bounds.
+    loaded = ct.gather(x, src_idx, mask=keep, padding_value=-1.0, check_bounds=True)
+    ct.scatter(y, lane, loaded)
+
+
+def test_gather_with_mask_and_bounds_check():
+    """The custom mask and the bounds check must both pass for an element to load."""
+    src = torch.arange(10, dtype=torch.float32, device="cuda")  # 10 elements
+    dst = torch.zeros(8, dtype=torch.float32, device="cuda")
+    # Positions 2 and 5 point outside `src` (15 and 20);
+    # positions 3 and 6 are in bounds but masked off.
+    src_idx = torch.tensor([0, 1, 15, 3, 4, 20, 6, 7], dtype=torch.int32,
+                           device="cuda")
+    keep = torch.tensor([True, True, True, False, True, True, False, True],
+                        dtype=torch.bool, device="cuda")
+
+    stream = torch.cuda.current_stream()
+    ct.launch(stream, (1,), gather_with_mask_and_bounds_check, (src, dst, src_idx, keep))
+
+    # Per output position (padding value is -1.0):
+    #   0: kept,   index 0 in bounds      -> src[0] = 0.0
+    #   1: kept,   index 1 in bounds      -> src[1] = 1.0
+    #   2: kept,   index 15 out of bounds -> padding
+    #   3: masked, index 3 in bounds      -> padding
+    #   4: kept,   index 4 in bounds      -> src[4] = 4.0
+    #   5: kept,   index 20 out of bounds -> padding
+    #   6: masked, index 6 in bounds      -> padding
+    #   7: kept,   index 7 in bounds      -> src[7] = 7.0
+    want = torch.tensor([0.0, 1.0, -1.0, -1.0, 4.0, -1.0, -1.0, 7.0], device="cuda")
+    assert_equal(want, dst)
+
+
+@ct.kernel
+def scatter_with_custom_mask(x, y, mask_array):
+    """Kernel: store 8 elements of `x` into `y` under a mask read from `mask_array`."""
+    lane = ct.arange(8, dtype=ct.int32)
+    keep = ct.gather(mask_array, lane)
+    payload = ct.gather(x, lane)
+    # Bounds checks are off, so the custom mask alone decides which stores happen.
+    ct.scatter(y, lane, payload, mask=keep, check_bounds=False)
+
+
+def test_scatter_with_custom_mask():
+    """A custom mask on scatter writes the selected elements and leaves the rest alone."""
+    # Keep the even positions, drop the odd ones.
+    keep = torch.tensor([pos % 2 == 0 for pos in range(8)],
+                        dtype=torch.bool, device="cuda")
+    # Source values are 100..107; the destination starts out as all zeros.
+    src = torch.arange(100, 108, dtype=torch.float32, device="cuda")
+    dst = torch.zeros(8, dtype=torch.float32, device="cuda")
+
+    ct.launch(torch.cuda.current_stream(), (1,), scatter_with_custom_mask, (src, dst, keep))
+
+    # Dropped positions must still hold the initial zero.
+    want = torch.tensor([100.0, 0.0, 102.0, 0.0, 104.0, 0.0, 106.0, 0.0], device="cuda")
+    assert_equal(want, dst)
+
+
+@ct.kernel
+def gather_2d_with_broadcast_mask(x, y, mask_array):
+    """Kernel: 2D gather from `x` whose (4, 1) mask is broadcast across the columns."""
+    # Row and column indices; together they broadcast to shape (4, 4).
+    rows = ct.arange(4, dtype=ct.int32)[:, None]  # shape (4, 1)
+    cols = ct.arange(4, dtype=ct.int32)  # shape (4,)
+    # Read column 0 of `mask_array` with (4, 1) indices, giving a (4, 1) mask tile.
+    row_keep = ct.gather(mask_array, (ct.arange(4, dtype=ct.int32)[:, None], 0))
+    # The (4, 1) mask is broadcast against the (4, 4) index shape.
+    block = ct.gather(x, (rows, cols), mask=row_keep, padding_value=0.0, check_bounds=False)
+    # Flatten to 16 elements and write them out contiguously.
+    ct.scatter(y, ct.arange(16, dtype=ct.int32), ct.reshape(block, (16,)))
+
+
+def test_gather_2d_with_broadcast_mask():
+    """A (4, 1) mask broadcasts over the (4, 4) index shape of a 2D gather."""
+    # One mask entry per row, already shaped (4, 1) on the host side.
+    row_keep = torch.tensor([[True], [False], [True], [False]], dtype=torch.bool,
+                            device="cuda")
+    src = torch.arange(16, dtype=torch.float32, device="cuda").reshape(4, 4)
+    dst = torch.zeros(16, dtype=torch.float32, device="cuda")
+
+    stream = torch.cuda.current_stream()
+    ct.launch(stream, (1,), gather_2d_with_broadcast_mask, (src, dst, row_keep))
+
+    # Inside the kernel:
+    #   row indices (4, 1):  [[0], [1], [2], [3]]
+    #   col indices (4,):    [0, 1, 2, 3]
+    #   broadcast rows:      [[0,0,0,0], [1,1,1,1], [2,2,2,2], [3,3,3,3]]
+    #   broadcast cols:      [[0,1,2,3], [0,1,2,3], [0,1,2,3], [0,1,2,3]]
+    #   broadcast mask:      [[T,T,T,T], [F,F,F,F], [T,T,T,T], [F,F,F,F]]
+    # Flattened result, row by row (padding value is 0.0):
+    #   row 0 (kept):   src[0, 0..3] = [0, 1, 2, 3]
+    #   row 1 (masked): [0, 0, 0, 0]
+    #   row 2 (kept):   src[2, 0..3] = [8, 9, 10, 11]
+    #   row 3 (masked): [0, 0, 0, 0]
+    flat = [0, 1, 2, 3, 0, 0, 0, 0, 8, 9, 10, 11, 0, 0, 0, 0]
+    want = torch.tensor(flat, dtype=torch.float32, device="cuda")
+    assert_equal(want, dst)
+
+
+@ct.kernel
+def gather_with_scalar_mask(x, y, mask_val: ct.Constant[bool]):
+    """Kernel: gather 8 elements of `x` using one constant boolean as the mask for all."""
+    lane = ct.arange(8, dtype=ct.int32)
+    loaded = ct.gather(x, lane, mask=mask_val, padding_value=-1.0, check_bounds=False)
+    ct.scatter(y, lane, loaded)
+
+
+@pytest.mark.parametrize("mask_val", [True, False])
+def test_gather_with_scalar_mask(mask_val):
+    """A scalar mask applies to every element of the gather."""
+    src = torch.arange(8, dtype=torch.float32, device="cuda")
+    dst = torch.zeros(8, dtype=torch.float32, device="cuda")
+
+    stream = torch.cuda.current_stream()
+    ct.launch(stream, (1,), gather_with_scalar_mask, (src, dst, mask_val))
+
+    # The padding tensor is built only for the all-masked case.
+    if not mask_val:
+        # Mask is False: nothing is loaded, so every element is the padding value -1.0.
+        assert_equal(torch.full_like(src, -1.0), dst)
+        return
+    # Mask is True: everything is loaded, so the output matches the source.
+    assert_equal(src, dst)
+
+
+def test_mask_type_error():
+    """A mask with a non-boolean dtype is rejected with TileTypeError."""
+    @ct.kernel
+    def gather_with_int_mask(x, y):
+        lane = ct.arange(8, dtype=ct.int32)
+        int_mask = ct.arange(8, dtype=ct.int32)  # deliberately int32, not boolean
+        loaded = ct.gather(x, lane, mask=int_mask, check_bounds=False)
+        ct.scatter(y, lane, loaded)
+
+    src = torch.arange(8, dtype=torch.float32, device="cuda")
+    dst = torch.zeros(8, dtype=torch.float32, device="cuda")
+
+    with pytest.raises(TileTypeError, match="boolean"):
+        ct.launch(torch.cuda.current_stream(), (1,), gather_with_int_mask, (src, dst))
+
+
+def test_mask_shape_error():
+    """A mask whose shape cannot broadcast to the index shape raises TileTypeError."""
+    @ct.kernel
+    def gather_with_wrong_shape_mask(x, y):
+        lane = ct.arange(8, dtype=ct.int32)
+        # Boolean mask of shape (4,), which does not broadcast to the (8,) indices.
+        short_keep = ct.arange(4, dtype=ct.int32) > 0  # shape (4,), bool
+        loaded = ct.gather(x, lane, mask=short_keep, check_bounds=False)
+        ct.scatter(y, lane, loaded)
+
+    src = torch.arange(8, dtype=torch.float32, device="cuda")
+    dst = torch.zeros(8, dtype=torch.float32, device="cuda")
+
+    with pytest.raises(TileTypeError, match="not broadcastable"):
+        ct.launch(torch.cuda.current_stream(), (1,), gather_with_wrong_shape_mask, (src, dst))