From a8aa9e03ac20ce1670be81793c65b617128e3207 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 20 Feb 2025 17:01:04 +0100
Subject: [PATCH 001/208] WIP

---
 xlb/compute_backend.py                        |   1 +
 xlb/default_config.py                         |  18 ++
 xlb/grid/grid.py                              |  15 +-
 xlb/grid/neon_grid.py                         | 135 +++++++++++++++
 xlb/helper/initializers.py                    |   6 +
 xlb/helper/nse_solver.py                      |   4 +-
 .../boundary_condition/bc_equilibrium.py      |  30 +++-
 .../bc_halfway_bounce_back.py                 |  32 ++++
 .../indices_boundary_masker.py                | 128 +++++++++++++++
 .../equilibrium/quadratic_equilibrium.py      |  80 +++++++++
 xlb/operator/macroscopic/macroscopic.py       |  65 ++++++++
 xlb/operator/macroscopic/zero_moment.py       |  32 ++++
 xlb/operator/operator.py                      |  18 +-
 xlb/operator/stepper/nse_stepper.py           | 155 +++++++++++++++++-
 xlb/operator/stream/stream.py                 |  52 ++++++
 xlb/velocity_set/velocity_set.py              |   5 +
 16 files changed, 769 insertions(+), 7 deletions(-)
 create mode 100644 xlb/grid/neon_grid.py

diff --git a/xlb/compute_backend.py b/xlb/compute_backend.py
index 60da2912..6b4ed702 100644
--- a/xlb/compute_backend.py
+++ b/xlb/compute_backend.py
@@ -6,3 +6,4 @@
 class ComputeBackend(Enum):
     JAX = auto()
     WARP = auto()
+    NEON = auto()
diff --git a/xlb/default_config.py b/xlb/default_config.py
index 3823353c..1a2aef65 100644
--- a/xlb/default_config.py
+++ b/xlb/default_config.py
@@ -20,6 +20,24 @@ def init(velocity_set, default_backend, default_precision_policy):
         import warp as wp
 
         wp.init()  # TODO: Must be removed in the future versions of WARP
+    elif default_backend == ComputeBackend.NEON:
+        import warp as wp
+        import neon
+
+        wp.config.mode = "release"
+        wp.config.llvm_cuda = False
+        wp.config.verbose = True
+        wp.verbose_warnings = True
+
+        wp.init()
+
+
+        # It's a good idea to always clear the kernel cache when developing new native or codegen features
+        wp.build.clear_kernel_cache()
+
+        # !!! DO THIS BEFORE DEFINING/USING ANY KERNELS WITH CUSTOM TYPES
+        neon.init()
+
     elif default_backend == ComputeBackend.JAX:
         check_backend_support()
     else:
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index 53139fc1..1f785362 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -6,12 +6,20 @@
 from xlb.compute_backend import ComputeBackend
 
 
-def grid_factory(shape: Tuple[int, ...], compute_backend: ComputeBackend = None):
+def grid_factory(shape: Tuple[int, ...],
+                 compute_backend: ComputeBackend = None,
+                 velocity_set=None,
+                 ):
     compute_backend = compute_backend or DefaultConfig.default_backend
     if compute_backend == ComputeBackend.WARP:
         from xlb.grid.warp_grid import WarpGrid
 
         return WarpGrid(shape)
+    elif compute_backend == ComputeBackend.NEON:
+        from xlb.grid.neon_grid import NeonGrid
+
+        return NeonGrid(shape=shape,
+                        velocity_set=velocity_set)
     elif compute_backend == ComputeBackend.JAX:
         from xlb.grid.jax_grid import JaxGrid
 
@@ -21,7 +29,10 @@ def grid_factory(shape: Tuple[int, ...], compute_backend: ComputeBackend = None)
 
 
 class Grid(ABC):
-    def __init__(self, shape: Tuple[int, ...], compute_backend: ComputeBackend):
+    def __init__(self,
+                 shape: Tuple[int, ...],
+                 compute_backend: ComputeBackend,
+                 ):
         self.shape = shape
         self.dim = len(shape)
         self.compute_backend = compute_backend
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
new file mode 100644
index 00000000..b81baac6
--- /dev/null
+++ b/xlb/grid/neon_grid.py
@@ -0,0 +1,135 @@
+import warp as wp
+from cryptography.hazmat.backends.openssl.backend import backend
+
+import neon
+from .grid import Grid
+from xlb.precision_policy import Precision
+from xlb.compute_backend import ComputeBackend
+from typing import Literal
+from xlb import DefaultConfig
+
+class NeonGrid(Grid):
+    def __init__(self, shape, velocity_set):
+        from .warp_grid import WarpGrid
+
+        self.bk = None
+        self.dim = None
+        self.grid = None
+        self.xlb_lattice = velocity_set
+        self.warp_grid = WarpGrid(shape)
+
+        super().__init__(shape, ComputeBackend.NEON)
+
+
+    def _get_velocity_set(self):
+        return self.xlb_lattice
+
+    def _initialize_backend(self):
+        # do nothing
+        pass
+
+    def _initialize_backend(self):
+        import neon
+
+        # FIXME@max: for now we hardcode the number of devices to 1 (device index 0)
+        num_devs = 1
+        dev_idx_list = list(range(num_devs))
+
+        if len(self.shape) == 2:
+            import py_neon
+            self.dim = py_neon.Index_3d(self.shape[0],
+                                        1,
+                                        self.shape[1])
+            self.neon_stencil = []
+            for c_idx in range(len(self.xlb_lattice._c[0])):
+                xval = self.xlb_lattice._c[0][c_idx]
+                yval = self.xlb_lattice._c[1][c_idx]
+                self.neon_stencil.append([xval, 0, yval])
+
+        else:
+            self.dim = neon.Index_3d(self.shape[0],
+                                        self.shape[1],
+                                        self.shape[2])
+
+            self.neon_stencil = []
+            for c_idx in range(len(self.xlb_lattice._c[0])):
+                xval = self.xlb_lattice._c[0][c_idx]
+                yval = self.xlb_lattice._c[1][c_idx]
+                zval = self.xlb_lattice._c[2][c_idx]
+                self.neon_stencil.append([xval, yval, zval])
+
+        self.bk = neon.Backend(
+            runtime=neon.Backend.Runtime.stream,
+            dev_idx_list=dev_idx_list)
+
+        self.grid = neon.dense.dGrid(
+            backend=self.bk,
+            dim=self.dim,
+            sparsity=None,
+            stencil=self.neon_stencil)
+        pass
+
+    def create_field(
+            self,
+            cardinality: int,
+            dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+            fill_value=None,
+    ):
+        dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
+        field = self.grid.new_field(cardinality=cardinality,
+                                    dtype=dtype, )
+
+        if fill_value is None:
+            neon.Container.zero(field).run(0)
+        else:
+            neon.Container.fill(field, fill_value).run(0)
+        return field
+
+    def _create_warp_field(self,
+                           cardinality: int,
+                           dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+                           fill_value=None,
+                           ne_field=None
+    ):
+        warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
+        if ne_field is None:
+            return warp_field
+
+        _d = self.xlb_lattice.d
+
+        import neon, typing
+        @neon.Container.factory
+        def container(
+                src_field: typing.Any,
+                dst_field: typing.Any,
+                cardinality: wp.int32
+        ):
+            def loading_step(loader: neon.Loader):
+                loader.declare_execution_scope(self.grid)
+                src_pn = loader.get_read_handel(src_field)
+
+                @wp.func
+                def cloning(gridIdx: typing.Any):
+                    cIdx = wp.neon_global_idx(src_pn, gridIdx)
+                    gx = wp.neon_get_x(cIdx)
+                    gy = wp.neon_get_y(cIdx)
+                    gz = wp.neon_get_z(cIdx)
+
+                    # TODO@Max - for 2D problems XLB flattens the z dimension, while neon flattens the y dimension, hence the swap
+                    if _d == 2:
+                        gy, gz = gz, gy
+
+                    for card in range(cardinality):
+                        value = wp.neon_read(src_pn,
+                                      gridIdx,
+                                      card)
+                        dst_field[card, gx, gy, gz] = value
+
+                loader.declare_kernel(cloning)
+
+            return loading_step
+
+        c = container(src_field=ne_field, dst_field=warp_field, cardinality=cardinality)
+        c.run(0)
+        wp.synchronize()
+        return warp_field
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index ccb4a82f..40f7f9a4 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -15,6 +15,12 @@ def initialize_eq(f, grid, velocity_set, precision_policy, backend, rho=None, u=
     elif backend == ComputeBackend.WARP:
         f = equilibrium(rho, u, f)
 
+    elif backend == ComputeBackend.NEON:
+        f = equilibrium(rho, u, f)
+        pass
+    else:
+        raise NotImplementedError(f"Backend {backend} not implemented")
+
     del rho, u
 
     return f
diff --git a/xlb/helper/nse_solver.py b/xlb/helper/nse_solver.py
index 361dc6e3..c56eb07c 100644
--- a/xlb/helper/nse_solver.py
+++ b/xlb/helper/nse_solver.py
@@ -30,12 +30,12 @@ def create_nse_fields(
     if grid is None:
         if grid_shape is None:
             raise ValueError("grid_shape must be provided when grid is None")
-        grid = grid_factory(grid_shape, compute_backend=compute_backend)
+        grid = grid_factory(grid_shape, compute_backend=compute_backend, velocity_set=velocity_set)
 
     # Create fields
     f_0 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
     f_1 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
-    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
+    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.UINT8)
     bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
     return grid, f_0, f_1, missing_mask, bc_mask
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index 85cfd653..fbfb2449 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -90,8 +90,36 @@ def functional(
 
         return functional, kernel
 
+    def _construct_neon(self):
+        # Set local constants TODO: This is a hack and should be fixed with warp update
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        _rho = self.compute_dtype(self.rho)
+        _u = _u_vec(self.u[0], self.u[1], self.u[2]) if self.velocity_set.d == 3 else _u_vec(self.u[0], self.u[1])
+
+        # Construct the functional for this BC
+        @wp.func
+        def functional(
+            index: Any,
+            timestep: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+        ):
+            # We can directly use the neon_functional of the equilibrium operator:
+            # the Neon implementation is the same as the Warp implementation, as all the computation
+            # is done at the register level
+            _f = self.equilibrium_operator.neon_functional(_rho, _u)
+            return _f
+
+        # Use the parent class's kernel and pass the functional
+        kernel = None
+
+        return functional, kernel
+
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+    def warp_launch(self, f_pre, f_post, bc_mask, missing_mask):
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 8ede0c8b..aaa565aa 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -97,3 +97,35 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             dim=f_pre.shape[1:],
         )
         return f_post
+
+    def _construct_neon(self):
+        # Set local constants
+        _opp_indices = self.velocity_set.opp_indices
+
+        # Construct the functional for this BC
+        @wp.func
+        def functional(
+            index: Any,
+            timestep: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+        ):
+            # Post-streaming values are only modified at missing direction
+            _f = f_post
+            for l in range(self.velocity_set.q):
+                # If the mask is missing then take the opposite index
+                if missing_mask[l] == wp.uint8(1):
+                    # Get the pre-streaming distribution function in the opposite direction
+                    _f[l] = f_pre[_opp_indices[l]]
+
+            return _f
+
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Raise an exception, as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in NEON yet.")
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 3c5ea867..0b302866 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -218,3 +218,131 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         )
 
         return bc_mask, missing_mask
+
+    def _construct_neon(self):
+        # All the computation is done at the register step
+        return None, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, grid=None):
+        # Pre-allocate arrays with maximum possible size
+        velocity_set = grid._get_velocity_set()
+        missing_mask_warp = grid._create_warp_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
+        bc_mask_warp = grid._create_warp_field(cardinality=1, dtype=Precision.UINT8)
+        _, warp_kernel = self._construct_warp()
+
+        max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
+        indices = np.zeros((3, max_size), dtype=np.int32)
+        id_numbers = np.zeros(max_size, dtype=np.uint8)
+        is_interior = np.zeros(max_size, dtype=bool)
+
+        current_index = 0
+        for bc in bclist:
+            assert bc.indices is not None, f'Please specify indices associated with the {bc.__class__.__name__} BC using keyword "indices"!'
+            assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
+
+            bc_indices = np.asarray(bc.indices)
+            num_indices = bc_indices.shape[1]
+
+            # Ensure indices are 3D
+            if bc_indices.shape[0] == 2:
+                bc_indices = np.vstack([bc_indices, np.zeros(num_indices, dtype=int)])
+
+            # Add indices to the pre-allocated array
+            indices[:, current_index : current_index + num_indices] = bc_indices
+
+            # Set id numbers
+            id_numbers[current_index : current_index + num_indices] = bc.id
+
+            # Set is_interior flags
+            if bc.needs_padding:
+                is_interior[current_index : current_index + num_indices] = self.are_indices_in_interior(bc_indices, bc_mask_warp[0].shape)
+            else:
+                is_interior[current_index : current_index + num_indices] = False
+
+            current_index += num_indices
+
+            # Remove indices from BC objects
+            bc.__dict__.pop("indices", None)
+
+        # Trim arrays to actual size
+        indices = indices[:, :current_index]
+        id_numbers = id_numbers[:current_index]
+        is_interior = is_interior[:current_index]
+
+        # Convert to Warp arrays
+        wp_indices = wp.array(indices, dtype=wp.int32)
+        wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
+        wp_is_interior = wp.array(is_interior, dtype=wp.bool)
+
+        if start_index is None:
+            start_index = wp.vec3i(0, 0, 0)
+        else:
+            start_index = wp.vec3i(*start_index)
+
+        # Launch the warp kernel
+        wp.launch(
+            warp_kernel,
+            dim=current_index,
+            inputs=[
+                wp_indices,
+                wp_id_numbers,
+                wp_is_interior,
+                bc_mask_warp,
+                missing_mask_warp,
+                start_index,
+            ],
+        )
+        wp.synchronize()
+
+        import wpne, typing
+        @wpne.Container.factory
+        def container(
+                bc_mask_warp: typing.Any,
+                missing_mask_warp: typing.Any,
+                bc_mask_field: typing.Any,
+                missing_mask_field: typing.Any,
+        ):
+            def loading_step(loader: wpne.Loader):
+                loader.declare_execution_scope(bc_mask.get_grid())
+
+                bc_mask_hdl = loader.get_read_handel(bc_mask_field)
+                missing_mask_hdl = loader.get_read_handel(missing_mask_field)
+
+                @wp.func
+                def masker(gridIdx: typing.Any):
+                    cIdx = wp.neon_global_idx(bc_mask_hdl, gridIdx)
+                    gx = wp.neon_get_x(cIdx)
+                    gy = wp.neon_get_y(cIdx)
+                    gz = wp.neon_get_z(cIdx)
+                    # TODO@Max - XLB flattens the z dimension in 2D, while neon flattens the y dimension; y and z are swapped in the lookups below
+                    local_mask = bc_mask_warp[
+                        0,
+                        gx,
+                        gz,
+                        gy]
+                    wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
+
+                    for q in range(self.velocity_set.q):
+                        is_missing = wp.uint8( missing_mask_warp[
+                            q,
+                            wp.neon_get_x(cIdx),
+                            wp.neon_get_z(cIdx),
+                            wp.neon_get_y(cIdx)])
+                        wp.neon_write(missing_mask_hdl,
+                                      gridIdx,
+                                      q,
+                                      is_missing)
+
+                loader.declare_kernel(masker)
+
+            return loading_step
+
+        c = container(bc_mask_warp, missing_mask_warp, bc_mask, missing_mask)
+        c.run(0)
+        wp.synchronize()
+
+        del bc_mask_warp
+        del missing_mask_warp
+
+        return bc_mask, missing_mask
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index 62cc0414..31dbfed0 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -2,6 +2,12 @@
 import jax.numpy as jnp
 from jax import jit
 import warp as wp
+import os
+
+# Print the PYTHONPATH
+pythonpath = os.environ.get('PYTHONPATH', 'PYTHONPATH is not set')
+print(f"PYTHONPATH: {pythonpath}")
+import neon
 from typing import Any
 
 from xlb.compute_backend import ComputeBackend
@@ -96,3 +102,77 @@ def warp_implementation(self, rho, u, f):
             dim=rho.shape[1:],
         )
         return f
+
+
+    def _construct_neon(self):
+        import neon
+        # Set local constants TODO: This is a hack and should be fixed with warp update
+        _c = self.velocity_set.c
+        _w = self.velocity_set.w
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+
+        # Construct the equilibrium functional
+        @wp.func
+        def functional(
+            rho: Any,
+            u: Any,
+        ):
+            # Allocate the equilibrium
+            feq = _f_vec()
+
+            # Compute the equilibrium
+            for l in range(self.velocity_set.q):
+                # Compute cu
+                cu = self.compute_dtype(0.0)
+                for d in range(self.velocity_set.d):
+                    if _c[d, l] == 1:
+                        cu += u[d]
+                    elif _c[d, l] == -1:
+                        cu -= u[d]
+                cu *= self.compute_dtype(3.0)
+
+                # Compute usqr
+                usqr = self.compute_dtype(1.5) * wp.dot(u, u)
+
+                # Compute feq
+                feq[l] = rho * _w[l] * (self.compute_dtype(1.0) + cu * (self.compute_dtype(1.0) + self.compute_dtype(0.5) * cu) - usqr)
+
+            return feq
+
+        import neon, typing
+        @neon.Container.factory(name="QuadraticEquilibrium")
+        def container(
+            rho: Any,
+            u: Any,
+            f: Any,
+        ):
+
+            def quadratic_equilibrium_ll(loader:neon.Loader):
+                loader.set_grid(rho.get_grid())
+                rho_pn=loader.get_read_handle(rho)
+                u_pn =loader.get_read_handle(u)
+                f_pn=loader.get_write_handle(f)
+
+                @wp.func
+                def quadratic_equilibrium_cl(index: typing.Any):
+                    _u = _u_vec()
+                    for d in range(self.velocity_set.d):
+                        _u[d] = wp.neon_read(u_pn, index, d)
+                    _rho = wp.neon_read(rho_pn, index, 0)
+                    feq = functional(_rho, _u)
+
+                    # Set the output
+                    for l in range(self.velocity_set.q):
+                        #wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                        wp.neon_write(f_pn, index, l, feq[l])
+                loader.declare_kernel(quadratic_equilibrium_cl)
+            return quadratic_equilibrium_ll
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, rho, u, f):
+        c = self.neon_container( rho, u, f)
+        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+        return f
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index ab1193b0..c7fd2c13 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -64,3 +64,68 @@ def warp_implementation(self, f, rho, u):
             dim=rho.shape[1:],
         )
         return rho, u
+
+    def _construct_neon(self):
+        zero_moment_func = self.zero_moment.neon_functional
+        first_moment_func = self.first_moment.neon_functional
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(f: _f_vec):
+            rho = zero_moment_func(f)
+            u = first_moment_func(f, rho)
+            return rho, u
+
+        @wp.kernel
+        def kernel(
+            f: wp.array4d(dtype=Any),
+            rho: wp.array4d(dtype=Any),
+            u: wp.array4d(dtype=Any),
+        ):
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            _f = _f_vec()
+            for l in range(self.velocity_set.q):
+                _f[l] = f[l, index[0], index[1], index[2]]
+            _rho, _u = functional(_f)
+
+            rho[0, index[0], index[1], index[2]] = self.store_dtype(_rho)
+            for d in range(self.velocity_set.d):
+                u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
+
+        import neon, typing
+        @neon.Container.factory
+        def container(
+                f_field: Any,
+                rho_field: Any,
+                u_fild: Any,
+        ):
+            _d = self.velocity_set.d
+            def macroscopic_ll(loader: neon.Loader):
+                loader.declare_execution_scope(f_field.get_grid())
+
+                rho=loader.get_read_handel(rho_field)
+                u =loader.get_read_handel(u_fild)
+                f=loader.get_read_handel(f_field)
+
+                @wp.func
+                def macroscopic_cl(gIdx: typing.Any):
+                    _f = _f_vec()
+                    for l in range(self.velocity_set.q):
+                        _f[l] = wp.neon_read(f, gIdx,l)
+                    _rho, _u = functional(_f)
+                    wp.neon_write(rho, gIdx, 0, _rho)
+                    for d in range(_d):
+                        wp.neon_write(u, gIdx, d, _u[d])
+
+                loader.declare_kernel(macroscopic_cl)
+            return macroscopic_ll
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f, rho, u):
+        c = self.neon_container(f, rho, u)
+        c.run(0)
+        wp.synchronize()
+        return rho, u
diff --git a/xlb/operator/macroscopic/zero_moment.py b/xlb/operator/macroscopic/zero_moment.py
index 8abb4de7..23ff2e7a 100644
--- a/xlb/operator/macroscopic/zero_moment.py
+++ b/xlb/operator/macroscopic/zero_moment.py
@@ -47,3 +47,35 @@ def kernel(
     def warp_implementation(self, f, rho):
         wp.launch(self.warp_kernel, inputs=[f, rho], dim=rho.shape[1:])
         return rho
+
+    def _construct_neon(self):
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(f: _f_vec):
+            rho = self.compute_dtype(0.0)
+            for l in range(self.velocity_set.q):
+                rho += f[l]
+            return rho
+        #
+        # @wp.kernel
+        # def kernel(
+        #     f: wp.array4d(dtype=Any),
+        #     rho: wp.array4d(dtype=Any),
+        # ):
+        #     i, j, k = wp.tid()
+        #     index = wp.vec3i(i, j, k)
+        #
+        #     _f = _f_vec()
+        #     for l in range(self.velocity_set.q):
+        #         _f[l] = f[l, index[0], index[1], index[2]]
+        #     _rho = functional(_f)
+        #
+        #     rho[0, index[0], index[1], index[2]] = _rho
+
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f, rho):
+        # Raise an exception, as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in NEON yet.")
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 6e8bbbbb..7b187f24 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -30,6 +30,9 @@ def __init__(self, velocity_set=None, precision_policy=None, compute_backend=Non
         if self.compute_backend == ComputeBackend.WARP:
             self.warp_functional, self.warp_kernel = self._construct_warp()
 
+        if self.compute_backend == ComputeBackend.NEON:
+            self.neon_functional, self.neon_container = self._construct_neon()
+
         # Updating JAX config in case fp64 is requested
         if self.compute_backend == ComputeBackend.JAX and (
             precision_policy == PrecisionPolicy.FP64FP64 or precision_policy == PrecisionPolicy.FP64FP32
@@ -70,7 +73,9 @@ def __call__(self, *args, callback=None, **kwargs):
                 error = e
                 traceback_str = traceback.format_exc()
                 continue  # This skips to the next candidate if binding fails
-
+        method_candidates = [
+            (key, method) for key, method in self._backends.items() if key[1] == self.compute_backend
+        ]
         raise Exception(f"Error captured for backend with key {key} for operator {self.__class__.__name__}: {error}\n {traceback_str}")
 
     @property
@@ -113,6 +118,8 @@ def compute_dtype(self):
             return self.precision_policy.compute_precision.jax_dtype
         elif self.compute_backend == ComputeBackend.WARP:
             return self.precision_policy.compute_precision.wp_dtype
+        elif self.compute_backend == ComputeBackend.NEON:
+            return self.precision_policy.compute_precision.wp_dtype
 
     @property
     def store_dtype(self):
@@ -132,3 +139,12 @@ def _construct_warp(self):
         Leave it for now, as it is not clear how the warp backend will evolve
         """
         return None, None
+
+    def _construct_neon(self):
+        """
+        Construct the Neon functional and Neon container of the operator
+        TODO: Maybe a better way to do this?
+        Maybe add this to the backend decorator?
+        Leave it for now, as it is not clear how the neon backend will evolve
+        """
+        return None, None
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 7326891a..21799a65 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -3,6 +3,7 @@
 from functools import partial
 from jax import jit
 import warp as wp
+import neon
 from typing import Any
 
 from xlb import DefaultConfig
@@ -79,8 +80,10 @@ def prepare_fields(self, initializer=None):
         # Copy f_0 using backend-specific copy to f_1
         if self.compute_backend == ComputeBackend.JAX:
             f_1 = f_0.copy()
-        else:
+        if self.compute_backend == ComputeBackend.WARP:
             wp.copy(f_1, f_0)
+        if self.compute_backend == ComputeBackend.NEON:
+            neon.Container.copy(f_1, f_0).run(0)
 
         # Process boundary conditions and update masks
         bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask)
@@ -99,6 +102,7 @@ def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
+            grid=cls
         )
         # Split boundary conditions by type
         bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
@@ -322,3 +326,152 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
             dim=f_0.shape[1:],
         )
         return f_0, f_1
+
+    def _construct_neon(self):
+        # Set local constants
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+        _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
+        _opp_indices = self.velocity_set.opp_indices
+        #_cast_to_store_dtype = self.store_dtype()
+
+        # Read the list of bc_to_id created upon instantiation
+        bc_to_id = boundary_condition_registry.bc_to_id
+        id_to_bc = boundary_condition_registry.id_to_bc
+
+        # Gather IDs of ExtrapolationOutflowBC boundary conditions
+        extrapolation_outflow_bc_ids = []
+        for bc_name, bc_id in bc_to_id.items():
+            if bc_name.startswith("ExtrapolationOutflowBC"):
+                extrapolation_outflow_bc_ids.append(bc_id)
+        # Group active boundary conditions
+        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
+
+        @wp.func
+        def apply_bc(
+            index: Any,
+            timestep: Any,
+            _boundary_id: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+            is_post_streaming: bool,
+        ):
+            f_result = f_post
+
+            # Unroll the loop over boundary conditions
+            for i in range(wp.static(len(self.boundary_conditions))):
+                if is_post_streaming:
+                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                else:
+                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                    if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].prepare_bc_auxilary_data)(
+                                index, timestep, missing_mask, f_0, f_1, f_pre, f_post
+                            )
+            return f_result
+
+        @wp.func
+        def neon_get_thread_data(
+            f0_pn: Any,
+            f1_pn: Any,
+            missing_mask_pn: Any,
+            index: Any,
+        ):
+            # Read thread data for populations
+            _f0_thread = _f_vec()
+            _f1_thread = _f_vec()
+            _missing_mask = _missing_mask_vec()
+            for l in range(self.velocity_set.q):
+                # q-sized vector of pre-streaming populations
+                _f0_thread[l] = self.compute_dtype(wp.neon_read(f0_pn, index, l))
+                _f1_thread[l] = self.compute_dtype(wp.neon_read(f1_pn, index, l))
+                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
+
+            return _f0_thread, _f1_thread, _missing_mask
+
+        import neon, typing
+        @neon.Container.factory(name="nse_stepper")
+        def container(
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                timestep: int,
+        ):
+            cast_to_store_dtype = self.store_dtype
+            def nse_stepper_ll(loader: neon.Loader):
+                loader.declare_execution_scope(bc_mask_fd.get_grid())
+                f_0_pn=loader.get_read_handel(f_0_fd)
+                f_1_pn =loader.get_read_handel(f_1_fd)
+                bc_mask_pn=loader.get_read_handel(bc_mask_fd)
+                missing_mask_pn=loader.get_read_handel(missing_mask_fd)
+
+                @wp.func
+                def nse_stepper_cl(index: typing.Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    # Apply streaming
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+
+                    # Apply post-streaming boundary conditions
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u)
+
+                    # Apply post-collision boundary conditions
+                    _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+
+                    # Store the result in f_1
+                    for l in range(self.velocity_set.q):
+                        # TODO: Improve this later
+                        if wp.static("GradsApproximationBC" in active_bcs):
+                            if _boundary_id == wp.static(boundary_condition_registry.bc_to_id["GradsApproximationBC"]):
+                                if _missing_mask[l] == wp.uint8(1):
+                                    wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                loader.declare_kernel(nse_stepper_cl)
+            return nse_stepper_ll
+
+        return None, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_launch(self, f_0, f_1, bc_mask, missing_mask, timestep):
+        #if self.c is None:
+        #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
+        import wpne
+        is_odd = timestep%2 == 1
+        is_even = not is_odd
+
+        c = None
+        if self.odd_or_even == 'even':
+            c = self.c_even
+        else:
+            c = self.c_odd
+
+        if c is None:
+            c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
+        c.run(0, container_runtime=wpne.Container.ContainerRuntime.neon)
+
+        if self.odd_or_even == 'even':
+            self.c_even = c
+        else:
+            self.c_odd = c
+
+        if self.odd_or_even == 'even':
+            self.odd_or_even = 'odd'
+
+        return f_0, f_1
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index 247fa5a9..5a1b99ca 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -112,3 +112,55 @@ def warp_implementation(self, f_0, f_1):
             dim=f_0.shape[1:],
         )
         return f_1
+
+    def _construct_neon(self):
+        # Set local constants TODO: This is a hack and should be fixed with warp update
+        _c = self.velocity_set.c
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        # Construct the functional to get streamed indices
+        @wp.func
+        def functional(
+            f:Any,
+            index: Any,
+        ):
+            # Pull the distribution function
+            _f = _f_vec()
+            for l in range(self.velocity_set.q):
+                # Get pull index
+                # pull_index = type(index)()
+                # for d in range(self.velocity_set.d):
+                #     pull_index[d] = index[d] - _c[d, l]
+
+                ngh = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
+                                      wp.int8(0),
+                                      wp.int8(-_c[1, l]))
+
+                unused_is_valid = wp.bool(False)
+                _f[l] = wp.neon_ngh_data(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)
+
+            return _f
+
+        # # Construct the warp kernel
+        # @wp.kernel
+        # def kernel(
+        #     f_0: wp.array4d(dtype=Any),
+        #     f_1: wp.array4d(dtype=Any),
+        # ):
+        #     # Get the global index
+        #     i, j, k = wp.tid()
+        #     index = wp.vec3i(i, j, k)
+        #
+        #     # Set the output
+        #     _f = functional(f_0, index)
+        #
+        #     # Write the output
+        #     for l in range(self.velocity_set.q):
+        #         f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
+
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_0, f_1):
+        # Raise an exception, as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in NEON yet.")
diff --git a/xlb/velocity_set/velocity_set.py b/xlb/velocity_set/velocity_set.py
index 33b2331b..2837531c 100644
--- a/xlb/velocity_set/velocity_set.py
+++ b/xlb/velocity_set/velocity_set.py
@@ -44,6 +44,8 @@ def __init__(self, d, q, c, w, precision_policy, backend):
         # Convert properties to backend-specific format
         if self.backend == ComputeBackend.WARP:
             self._init_warp_properties()
+        elif self.backend == ComputeBackend.NEON:
+            self._init_neon_properties()
         elif self.backend == ComputeBackend.JAX:
             self._init_jax_properties()
         else:
@@ -85,6 +87,9 @@ def _init_warp_properties(self):
         self.c_float = wp.constant(wp.mat((self.d, self.q), dtype=dtype)(self._c_float))
         self.qi = wp.constant(wp.mat((self.q, self.d * (self.d + 1) // 2), dtype=dtype)(self._qi))
 
+    def _init_neon_properties(self):
+        self._init_warp_properties()
+
     def _init_jax_properties(self):
         """
         Convert NumPy properties to JAX-specific properties.

From 49ec99e45bff68633fa2ddda7be2df00dc0f1c15 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 25 Feb 2025 11:32:28 +0100
Subject: [PATCH 002/208] WIP

---
 examples/performance/mlups_3d_neon.py         | 126 ++++++++++++++++++
 xlb/grid/neon_grid.py                         |   8 +-
 .../indices_boundary_masker.py                |  23 ++--
 xlb/operator/macroscopic/first_moment.py      |   9 ++
 xlb/operator/macroscopic/macroscopic.py       |  31 +----
 xlb/operator/macroscopic/zero_moment.py       |  25 +---
 xlb/operator/stepper/nse_stepper.py           |  35 ++---
 7 files changed, 172 insertions(+), 85 deletions(-)
 create mode 100644 examples/performance/mlups_3d_neon.py

diff --git a/examples/performance/mlups_3d_neon.py b/examples/performance/mlups_3d_neon.py
new file mode 100644
index 00000000..bfbbc8e0
--- /dev/null
+++ b/examples/performance/mlups_3d_neon.py
@@ -0,0 +1,126 @@
+from warp.examples.fem.example_convection_diffusion import velocity
+
+import xlb
+import argparse
+import time
+import warp as wp
+import numpy as np
+
+# add a directory to the PYTHON PATH
+import sys
+sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
+import neon
+
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import grid_factory
+from xlb.operator.stepper import IncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
+from xlb.distribute import distribute
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
+    # Positional arguments
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
+    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
+    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
+    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+
+    # Optional arguments
+    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
+    parser.add_argument("--velocity_set", type=str, default='D3Q19',
+                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
+                        )
+
+    return parser.parse_args()
+
+
+def setup_simulation(args):
+    backend = None
+    if args.backend == "jax": backend = ComputeBackend.JAX
+    elif args.backend == "warp": backend = ComputeBackend.WARP
+    elif args.backend == "neon": backend = ComputeBackend.NEON
+    if backend is None:
+        raise ValueError("Invalid backend")
+
+    precision_policy_map = {
+        "fp32/fp32": PrecisionPolicy.FP32FP32,
+        "fp64/fp64": PrecisionPolicy.FP64FP64,
+        "fp64/fp32": PrecisionPolicy.FP64FP32,
+        "fp32/fp16": PrecisionPolicy.FP32FP16,
+    }
+    precision_policy = precision_policy_map.get(args.precision)
+    if precision_policy is None:
+        raise ValueError("Invalid precision")
+
+    velocity_set = None
+    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if velocity_set is None:
+        raise ValueError("Invalid velocity set")
+
+    xlb.init(
+        velocity_set=velocity_set,
+        default_backend=backend,
+        default_precision_policy=precision_policy,
+    )
+
+    return backend, precision_policy
+
+
+def run(backend, precision_policy, grid_shape, num_steps):
+    # Create grid and setup boundary conditions
+    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    grid = grid_factory(grid_shape, velocity_set=velocity_set)
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    lid = box_no_edge["top"]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+
+    boundary_conditions = [EquilibriumBC(rho=1.0, u=(0.02, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
+
+    # Create stepper
+    stepper = IncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
+
+    # Distribute if using JAX backend
+    if backend == ComputeBackend.JAX:
+        stepper = distribute(
+            stepper,
+            grid,
+            velocity_set,
+        )
+
+    # Initialize fields and run simulation
+    omega = 1.0
+    f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
+    start_time = time.time()
+
+    for i in range(num_steps):
+        f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
+        f_0, f_1 = f_1, f_0
+    wp.synchronize()
+
+    return time.time() - start_time
+
+
+def calculate_mlups(cube_edge, num_steps, elapsed_time):
+    total_lattice_updates = cube_edge**3 * num_steps
+    mlups = (total_lattice_updates / elapsed_time) / 1e6
+    return mlups
+
+
+def main():
+    args = parse_arguments()
+    backend, precision_policy = setup_simulation(args)
+    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
+    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
+    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
+
+    print(f"Simulation completed in {elapsed_time:.2f} seconds")
+    print(f"MLUPs: {mlups:.2f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index b81baac6..2f74598a 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -20,7 +20,6 @@ def __init__(self, shape, velocity_set):
 
         super().__init__(shape, ComputeBackend.NEON)
 
-
     def _get_velocity_set(self):
         return self.xlb_lattice
 
@@ -29,7 +28,6 @@ def _initialize_backend(self):
         pass
 
     def _initialize_backend(self):
-        import neon
 
         # FIXME@max: for now we hardcode the number of devices to 0
         num_devs = 1
@@ -80,9 +78,9 @@ def create_field(
                                     dtype=dtype, )
 
         if fill_value is None:
-            neon.Container.zero(field).run(0)
+            field.zero_run(stream_idx = 0)
         else:
-            neon.Container.fill(field, fill_value).run(0)
+            field.fill_run(value=fill_value,stream_idx = 0)
         return field
 
     def _create_warp_field(self,
@@ -97,7 +95,7 @@ def _create_warp_field(self,
 
         _d = self.xlb_lattice.d
 
-        import neon, typing
+        import typing
         @neon.Container.factory
         def container(
                 src_field: typing.Any,
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 0b302866..9d38615f 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -224,11 +224,11 @@ def _construct_neon(self):
         return None, None
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, grid=None):
+    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
         # Pre-allocate arrays with maximum possible size
-        velocity_set = grid._get_velocity_set()
-        missing_mask_warp = grid._create_warp_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
-        bc_mask_warp = grid._create_warp_field(cardinality=1, dtype=Precision.UINT8)
+        velocity_set = xlb_grid._get_velocity_set()
+        missing_mask_warp = xlb_grid._create_warp_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
+        bc_mask_warp = xlb_grid._create_warp_field(cardinality=1, dtype=Precision.UINT8)
         _, warp_kernel = self._construct_warp()
 
         max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
@@ -290,24 +290,23 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, g
                 wp_is_interior,
                 bc_mask_warp,
                 missing_mask_warp,
-                start_index,
             ],
         )
         wp.synchronize()
 
-        import wpne, typing
-        @wpne.Container.factory
+        import neon, typing
+        @neon.Container.factory("")
         def container(
                 bc_mask_warp: typing.Any,
                 missing_mask_warp: typing.Any,
                 bc_mask_field: typing.Any,
                 missing_mask_field: typing.Any,
         ):
-            def loading_step(loader: wpne.Loader):
-                loader.declare_execution_scope(bc_mask.get_grid())
+            def loading_step(loader: neon.Loader):
+                loader.set_grid(bc_mask.get_grid())
 
-                bc_mask_hdl = loader.get_read_handel(bc_mask_field)
-                missing_mask_hdl = loader.get_read_handel(missing_mask_field)
+                bc_mask_hdl = loader.get_read_handle(bc_mask_field)
+                missing_mask_hdl = loader.get_read_handle(missing_mask_field)
 
                 @wp.func
                 def masker(gridIdx: typing.Any):
@@ -315,7 +314,7 @@ def masker(gridIdx: typing.Any):
                     gx = wp.neon_get_x(cIdx)
                     gy = wp.neon_get_y(cIdx)
                     gz = wp.neon_get_z(cIdx)
-                    # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+                    # TODO@Max - XLB is flattening the y dimension in 3D, while neon uses the z dimension
                     local_mask = bc_mask_warp[
                         0,
                         gx,
diff --git a/xlb/operator/macroscopic/first_moment.py b/xlb/operator/macroscopic/first_moment.py
index cb99a9ff..5555f812 100644
--- a/xlb/operator/macroscopic/first_moment.py
+++ b/xlb/operator/macroscopic/first_moment.py
@@ -65,3 +65,12 @@ def warp_implementation(self, f, rho, u):
             dim=u.shape[1:],
         )
         return u
+
+    def _construct_neon(self):
+        functional,_  = self._construct_warp()
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f, rho):
+        # Raise an exception, as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in NEON yet.")
\ No newline at end of file
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index c7fd2c13..5b033d9a 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -26,8 +26,8 @@ def jax_implementation(self, f):
         return rho, u
 
     def _construct_warp(self):
-        zero_moment_func = self.zero_moment.warp_functional
-        first_moment_func = self.first_moment.warp_functional
+        zero_moment_func = self.zero_moment.neon_functional
+        first_moment_func = self.first_moment.neon_functional
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         @wp.func
@@ -66,34 +66,9 @@ def warp_implementation(self, f, rho, u):
         return rho, u
 
     def _construct_neon(self):
-        zero_moment_func = self.zero_moment.neon_functional
-        first_moment_func = self.first_moment.neon_functional
+        functional, _ = self._construct_warp()
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
-        @wp.func
-        def functional(f: _f_vec):
-            rho = zero_moment_func(f)
-            u = first_moment_func(f, rho)
-            return rho, u
-
-        @wp.kernel
-        def kernel(
-            f: wp.array4d(dtype=Any),
-            rho: wp.array4d(dtype=Any),
-            u: wp.array4d(dtype=Any),
-        ):
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1], index[2]]
-            _rho, _u = functional(_f)
-
-            rho[0, index[0], index[1], index[2]] = self.store_dtype(_rho)
-            for d in range(self.velocity_set.d):
-                u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
-
         import neon, typing
         @neon.Container.factory
         def container(
diff --git a/xlb/operator/macroscopic/zero_moment.py b/xlb/operator/macroscopic/zero_moment.py
index 23ff2e7a..13d69b62 100644
--- a/xlb/operator/macroscopic/zero_moment.py
+++ b/xlb/operator/macroscopic/zero_moment.py
@@ -49,30 +49,7 @@ def warp_implementation(self, f, rho):
         return rho
 
     def _construct_neon(self):
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
-        @wp.func
-        def functional(f: _f_vec):
-            rho = self.compute_dtype(0.0)
-            for l in range(self.velocity_set.q):
-                rho += f[l]
-            return rho
-        #
-        # @wp.kernel
-        # def kernel(
-        #     f: wp.array4d(dtype=Any),
-        #     rho: wp.array4d(dtype=Any),
-        # ):
-        #     i, j, k = wp.tid()
-        #     index = wp.vec3i(i, j, k)
-        #
-        #     _f = _f_vec()
-        #     for l in range(self.velocity_set.q):
-        #         _f[l] = f[l, index[0], index[1], index[2]]
-        #     _rho = functional(_f)
-        #
-        #     rho[0, index[0], index[1], index[2]] = _rho
-
+        functional,_  = self._construct_warp()
         return functional, None
 
     @Operator.register_backend(ComputeBackend.NEON)
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 21799a65..891f18ab 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -33,6 +33,10 @@ def __init__(
         force_vector=None,
     ):
         super().__init__(grid, boundary_conditions)
+        self.odd_or_even='even'
+        self.c_even = None
+        self.c_odd = None
+
 
         # Construct the collision operator
         if collision_type == "BGK":
@@ -83,17 +87,17 @@ def prepare_fields(self, initializer=None):
         if self.compute_backend == ComputeBackend.WARP:
             wp.copy(f_1, f_0)
         if self.compute_backend == ComputeBackend.NEON:
-            neon.Container.copy(f_1, f_0).run(0)
+            f_1.copy_from_run(f_0, 0)
 
         # Process boundary conditions and update masks
-        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask)
+        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
         f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
 
         return f_0, f_1, bc_mask, missing_mask
 
     @classmethod
-    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask):
+    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
         """Process boundary conditions and update boundary masks."""
         # Check for boundary condition overlaps
         check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
@@ -102,14 +106,13 @@ def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
-            grid=cls
         )
         # Split boundary conditions by type
         bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
         bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
         # Process indices-based boundary conditions
         if bc_with_indices:
-            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
+            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
             mesh_masker = MeshBoundaryMasker(
@@ -396,22 +399,23 @@ def neon_get_thread_data(
 
             return _f0_thread, _f1_thread, _missing_mask
 
-        import neon, typing
+        import typing
         @neon.Container.factory(name="nse_stepper")
         def container(
                 f_0_fd: Any,
                 f_1_fd: Any,
                 bc_mask_fd: Any,
                 missing_mask_fd: Any,
+                omega: Any,
                 timestep: int,
         ):
             cast_to_store_dtype = self.store_dtype
             def nse_stepper_ll(loader: neon.Loader):
-                loader.declare_execution_scope(bc_mask_fd.get_grid())
-                f_0_pn=loader.get_read_handel(f_0_fd)
-                f_1_pn =loader.get_read_handel(f_1_fd)
-                bc_mask_pn=loader.get_read_handel(bc_mask_fd)
-                missing_mask_pn=loader.get_read_handel(missing_mask_fd)
+                loader.set_grid(bc_mask_fd.get_grid())
+                f_0_pn=(loader.get_read_handle(f_0_fd))
+                f_1_pn =loader.get_read_handle(f_1_fd)
+                bc_mask_pn=loader.get_read_handle(bc_mask_fd)
+                missing_mask_pn=loader.get_read_handle(missing_mask_fd)
 
                 @wp.func
                 def nse_stepper_cl(index: typing.Any):
@@ -430,7 +434,7 @@ def nse_stepper_cl(index: typing.Any):
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
 
                     # Apply post-collision boundary conditions
                     _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
@@ -449,10 +453,9 @@ def nse_stepper_cl(index: typing.Any):
         return None, container
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_launch(self, f_0, f_1, bc_mask, missing_mask, timestep):
+    def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         #if self.c is None:
         #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
-        import wpne
         is_odd = timestep%2 == 1
         is_even = not is_odd
 
@@ -463,8 +466,8 @@ def neon_launch(self, f_0, f_1, bc_mask, missing_mask, timestep):
             c = self.c_odd
 
         if c is None:
-            c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
-        c.run(0, container_runtime=wpne.Container.ContainerRuntime.neon)
+            c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
+        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
         if self.odd_or_even == 'even':
             self.c_even = c

From c4f21f8a11e318e8b11fee0ca13a9122b2c4b610 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 25 Feb 2025 16:19:20 +0100
Subject: [PATCH 003/208] WIP

---
 xlb/operator/boundary_condition/bc_fullway_bounce_back.py | 4 ++++
 xlb/operator/collision/bgk.py                             | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index 995e2ff9..3e1be12b 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -84,3 +84,7 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             dim=f_pre.shape[1:],
         )
         return f_post
+
+    def _construct_neon(self):
+        functional,  _  = self._construct_warp()
+        return functional, None
\ No newline at end of file
diff --git a/xlb/operator/collision/bgk.py b/xlb/operator/collision/bgk.py
index ac2da2e0..65d47598 100644
--- a/xlb/operator/collision/bgk.py
+++ b/xlb/operator/collision/bgk.py
@@ -63,6 +63,10 @@ def kernel(
 
         return functional, kernel
 
+    def _construct_neon(self):
+        functional, _ = self._construct_warp()
+        return functional, None
+
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, f, feq, fout, rho, u, omega):
         # Launch the warp kernel

From a6cfd4a17a37009cf6b9e2a8672982055a5e0c63 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 25 Feb 2025 17:04:30 +0100
Subject: [PATCH 004/208] WIP

---
 examples/performance/mlups_3d_neon.py   | 42 +++++++++++++++++++++++--
 xlb/operator/macroscopic/macroscopic.py | 10 +++---
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/examples/performance/mlups_3d_neon.py b/examples/performance/mlups_3d_neon.py
index bfbbc8e0..b77430ab 100644
--- a/examples/performance/mlups_3d_neon.py
+++ b/examples/performance/mlups_3d_neon.py
@@ -69,7 +69,7 @@ def setup_simulation(args):
     return backend, precision_policy
 
 
-def run(backend, precision_policy, grid_shape, num_steps):
+def run(macro, backend, precision_policy, grid_shape, num_steps):
     # Create grid and setup boundary conditions
     velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
     grid = grid_factory(grid_shape, velocity_set=velocity_set)
@@ -95,11 +95,18 @@ def run(backend, precision_policy, grid_shape, num_steps):
     # Initialize fields and run simulation
     omega = 1.0
     f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
+    rho  = stepper.grid.create_field(1, dtype=precision_policy.store_precision)
+    u  = stepper.grid.create_field(3, dtype=precision_policy.store_precision)
+
     start_time = time.time()
 
     for i in range(num_steps):
         f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
         f_0, f_1 = f_1, f_0
+
+        if i % 10 == 0 or i == num_steps - 1:
+            wp.synchronize()
+            post_process(macro, rho, u, f_0, i)
     wp.synchronize()
 
     return time.time() - start_time
@@ -110,12 +117,43 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
+def post_process(macro, rho, u, f_0,  i):
+    # Write the results. We'll use JAX backend for the post-processing
+    # import jax.numpy as jnp
+    # if not isinstance(f_0, jnp.ndarray):
+    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
+    #     f_0 = wp.to_jax(f_0)[..., 0]
+    # else:
+    #     f_0 = f_0
+    rho, u = macro(f_0, rho, u )
+    u.export_vti(f"lid_driven_cavity_{i}.vti", 'u')
+    pass
+
+    # # remove boundary cells
+    # rho = rho[:, 1:-1, 1:-1, 1:-1]
+    # u = u[:, 1:-1, 1:-1, 1:-1]
+    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
+    #
+    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
+    #
+    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
+    # ny=fields["u_magnitude"].shape[1]
+    # from xlb.utils import  save_image
+    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
 def main():
+
+
     args = parse_arguments()
     backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
+    from xlb.operator.macroscopic import Macroscopic
+    macro = Macroscopic(
+        compute_backend=ComputeBackend.NEON,
+        precision_policy=precision_policy,
+        velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=ComputeBackend.NEON),
+    )
+    elapsed_time = run(macro, backend, precision_policy, grid_shape, args.num_steps)
     mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
 
     print(f"Simulation completed in {elapsed_time:.2f} seconds")
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 5b033d9a..0c8ab43d 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -70,7 +70,7 @@ def _construct_neon(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         import neon, typing
-        @neon.Container.factory
+        @neon.Container.factory("macroscopic")
         def container(
                 f_field: Any,
                 rho_field: Any,
@@ -78,11 +78,11 @@ def container(
         ):
             _d = self.velocity_set.d
             def macroscopic_ll(loader: neon.Loader):
-                loader.declare_execution_scope(f_field.get_grid())
+                loader.set_grid(f_field.get_grid())
 
-                rho=loader.get_read_handel(rho_field)
-                u =loader.get_read_handel(u_fild)
-                f=loader.get_read_handel(f_field)
+                rho=loader.get_read_handle(rho_field)
+                u =loader.get_read_handle(u_fild)
+                f=loader.get_read_handle(f_field)
 
                 @wp.func
                 def macroscopic_cl(gIdx: typing.Any):

From 5bdcda60c8675471dfa6a0428a409b1cb4b8107a Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 26 Feb 2025 08:49:49 +0100
Subject: [PATCH 005/208] WIP: add Nse_simulation helper and even/odd Neon containers

---
 examples/performance/mlups_3d_neon.py        |  22 ++-
 examples/performance/mlups_3d_neon_sovler.py | 152 +++++++++++++++++++
 xlb/grid/grid.py                             |   3 +
 xlb/helper/nse_solver.py                     |  62 ++++++++
 xlb/operator/macroscopic/macroscopic.py      |   5 +
 xlb/operator/operator.py                     |  12 ++
 xlb/operator/stepper/nse_stepper.py          |  81 +++++++---
 xlb/operator/stream/stream.py                |   4 +-
 8 files changed, 304 insertions(+), 37 deletions(-)
 create mode 100644 examples/performance/mlups_3d_neon_sovler.py

diff --git a/examples/performance/mlups_3d_neon.py b/examples/performance/mlups_3d_neon.py
index b77430ab..30e20df3 100644
--- a/examples/performance/mlups_3d_neon.py
+++ b/examples/performance/mlups_3d_neon.py
@@ -84,14 +84,6 @@ def run(macro, backend, precision_policy, grid_shape, num_steps):
     # Create stepper
     stepper = IncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    # Distribute if using JAX backend
-    if backend == ComputeBackend.JAX:
-        stepper = distribute(
-            stepper,
-            grid,
-            velocity_set,
-        )
-
     # Initialize fields and run simulation
     omega = 1.0
     f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
@@ -104,9 +96,9 @@ def run(macro, backend, precision_policy, grid_shape, num_steps):
         f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
         f_0, f_1 = f_1, f_0
 
-        if i % 10 == 0 or i == num_steps - 1:
-            wp.synchronize()
-            post_process(macro, rho, u, f_0, i)
+    #if i % 2 == 0 or i == num_steps - 1:
+        wp.synchronize()
+        post_process(macro, rho, u, f_0, i)
     wp.synchronize()
 
     return time.time() - start_time
@@ -126,7 +118,13 @@ def post_process(macro, rho, u, f_0,  i):
     # else:
     #     f_0 = f_0
     rho, u = macro(f_0, rho, u )
-    u.export_vti(f"lid_driven_cavity_{i}.vti", 'u')
+    wp.synchronize()
+    u.update_host(0)
+    rho.update_host(0)
+    wp.synchronize()
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+
     pass
 
     # # remove boundary cells
diff --git a/examples/performance/mlups_3d_neon_sovler.py b/examples/performance/mlups_3d_neon_sovler.py
new file mode 100644
index 00000000..2eb356be
--- /dev/null
+++ b/examples/performance/mlups_3d_neon_sovler.py
@@ -0,0 +1,152 @@
+from warp.examples.fem.example_convection_diffusion import velocity
+
+import xlb
+import argparse
+import time
+import warp as wp
+import numpy as np
+
+# add a directory to the PYTHON PATH
+import sys
+sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
+import neon
+
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import grid_factory
+from xlb.operator.stepper import IncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
+from xlb.distribute import distribute
+import xlb
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
+    # Positional arguments
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
+    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
+    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
+    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+
+    # Optional arguments
+    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
+    parser.add_argument("--velocity_set", type=str, default='D3Q19',
+                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
+                        )
+
+    return parser.parse_args()
+
+
+def setup_simulation(args):
+    backend = None
+    if args.backend == "jax": backend = ComputeBackend.JAX
+    elif args.backend == "warp": backend = ComputeBackend.WARP
+    elif args.backend == "neon": backend = ComputeBackend.NEON
+    if backend is None:
+        raise ValueError("Invalid backend")
+
+    precision_policy_map = {
+        "fp32/fp32": PrecisionPolicy.FP32FP32,
+        "fp64/fp64": PrecisionPolicy.FP64FP64,
+        "fp64/fp32": PrecisionPolicy.FP64FP32,
+        "fp32/fp16": PrecisionPolicy.FP32FP16,
+    }
+    precision_policy = precision_policy_map.get(args.precision)
+    if precision_policy is None:
+        raise ValueError("Invalid precision")
+
+    velocity_set = None
+    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if velocity_set is None:
+        raise ValueError("Invalid velocity set")
+
+    xlb.init(
+        velocity_set=velocity_set,
+        default_backend=backend,
+        default_precision_policy=precision_policy,
+    )
+
+    return backend, precision_policy
+
+
+def run(macro, backend, precision_policy, grid_shape, num_steps):
+    # Create grid and setup boundary conditions
+    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    grid = grid_factory(grid_shape, velocity_set=velocity_set)
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    lid = box_no_edge["top"]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+
+    boundary_conditions = [EquilibriumBC(rho=1.0, u=(0.02, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
+
+    # Create stepper
+    stepper = IncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
+
+    # Initialize fields and run simulation
+    omega = 1.0
+
+    sim = xlb.helper.nse_solver.Nse_simulation(grid, velocity_set, stepper, omega)
+    start_time = time.time()
+
+    for i in range(num_steps):
+        sim.step()
+
+        if i % 10 == 0 or i == num_steps - 1:
+           sim.export_macroscopic("u_lid_driven_cavity_")
+    wp.synchronize()
+
+    return time.time() - start_time
+
+
+def calculate_mlups(cube_edge, num_steps, elapsed_time):
+    total_lattice_updates = cube_edge**3 * num_steps
+    mlups = (total_lattice_updates / elapsed_time) / 1e6
+    return mlups
+
+def post_process(macro, rho, u, f_0,  i):
+    # Write the results. We'll use JAX backend for the post-processing
+    # import jax.numpy as jnp
+    # if not isinstance(f_0, jnp.ndarray):
+    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
+    #     f_0 = wp.to_jax(f_0)[..., 0]
+    # else:
+    #     f_0 = f_0
+    rho, u = macro(f_0, rho, u )
+    wp.synchronize()
+    u.update_host(0)
+    rho.update_host(0)
+    wp.synchronize()
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+
+    pass
+
+    # # remove boundary cells
+    # rho = rho[:, 1:-1, 1:-1, 1:-1]
+    # u = u[:, 1:-1, 1:-1, 1:-1]
+    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
+    #
+    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
+    #
+    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
+    # ny=fields["u_magnitude"].shape[1]
+    # from xlb.utils import  save_image
+    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
+
+def main():
+
+
+    args = parse_arguments()
+    backend, precision_policy = setup_simulation(args)
+    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
+    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
+    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
+
+    print(f"Simulation completed in {elapsed_time:.2f} seconds")
+    print(f"MLUPs: {mlups:.2f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index 1f785362..2743a902 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -42,6 +42,9 @@ def __init__(self,
     def _initialize_backend(self):
         pass
 
+    def get_compute_backend(self):
+        return self.compute_backend
+
     def bounding_box_indices(self, remove_edges=False):
         """
         This function calculates the indices of the bounding box of a 2D or 3D grid.
diff --git a/xlb/helper/nse_solver.py b/xlb/helper/nse_solver.py
index c56eb07c..305402b5 100644
--- a/xlb/helper/nse_solver.py
+++ b/xlb/helper/nse_solver.py
@@ -39,3 +39,65 @@ def create_nse_fields(
     bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
     return grid, f_0, f_1, missing_mask, bc_mask
+
+class Nse_simulation:
+    def __init__(self, grid, velocity_set, stepper, omega):
+        self.stepper = stepper
+        self.grid = stepper.get_grid()
+        self.precision_policy = stepper.get_precision_policy()
+        self.velocity_set = velocity_set
+        self.omega = omega
+
+        # Create fields
+        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields()
+        # self.f_0 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        # self.f_1 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        # self.missing_mask = grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
+        # self.bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
+
+        self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
+        self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
+
+        self.odd_step = None
+        self.even_step = None
+        self.iteration_idx = -1
+        from xlb.operator.macroscopic import Macroscopic
+
+        self.macro = Macroscopic(
+            compute_backend=self.grid.compute_backend,
+            precision_policy=self.precision_policy,
+            velocity_set=self.velocity_set,
+        )
+        self.iteration
+
+    def __init_containers(self):
+        containers = self.stepper.get_containers(self.f_0, self.f_1, self.bc_mask, self.missing_mask, self.rho, self.u)
+        self.even_step = containers['even']
+        self.odd_step = containers['odd']
+
+        containers = self.macro.get_containers(self.f_0, self.f_1,self.rho, self.u)
+
+        self.even_macroscopic = containers['even']
+        self.odd_macroscopic = containers['odd']
+
+    def export_macroscopic(self, fname_prefix):
+        self.iteration_idx += 1
+
+        if self.iteration_idx % 2 == 0:
+            self.even_macroscopic()
+        else:
+            self.odd_macroscopic()
+
+        import warp as wp
+        wp.synchronize()
+        self.u.update_host(0)
+        self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
+
+        return
+
+    def step(self):
+        self.iteration_idx += 1
+        if self.iteration_idx % 2 == 0:
+            self.even_step()
+        else:
+            self.odd_step()
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 0c8ab43d..b7ace56c 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -98,6 +98,11 @@ def macroscopic_cl(gIdx: typing.Any):
             return macroscopic_ll
         return functional, container
 
+    def get_containers(self, f_0, f_1, rho, u):
+        _, container = self._construct_neon()
+        return {'even': container(f_0,   rho, u),
+                'odd': container(f_1,  rho, u)}
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho, u):
         c = self.neon_container(f, rho, u)
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 7b187f24..7157f3ed 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -131,6 +131,18 @@ def store_dtype(self):
         elif self.compute_backend == ComputeBackend.WARP:
             return self.precision_policy.store_precision.wp_dtype
 
+    def get_precision_policy(self):
+        """
+        Returns the precision policy
+        """
+        return self.precision_policy
+
+    def get_grid(self):
+        """
+        Returns the grid object
+        """
+        return self.grid
+
     def _construct_warp(self):
         """
         Construct the warp functional and kernel of the operator
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 891f18ab..288dbc7a 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -1,6 +1,8 @@
 # Base class for all stepper operators
 
 from functools import partial
+
+from docutils.nodes import container
 from jax import jit
 import warp as wp
 import neon
@@ -37,7 +39,6 @@ def __init__(
         self.c_even = None
         self.c_odd = None
 
-
         # Construct the collision operator
         if collision_type == "BGK":
             self.collision = BGK(self.velocity_set, self.precision_policy, self.compute_backend)
@@ -88,11 +89,41 @@ def prepare_fields(self, initializer=None):
             wp.copy(f_1, f_0)
         if self.compute_backend == ComputeBackend.NEON:
             f_1.copy_from_run(f_0, 0)
-
+        if True:
+            import xlb.velocity_set
+            from xlb.operator.macroscopic import Macroscopic
+            macro = Macroscopic(
+                compute_backend=ComputeBackend.NEON,
+                precision_policy=self.precision_policy,
+                velocity_set=xlb.velocity_set.D3Q19(precision_policy=self.precision_policy, backend=ComputeBackend.NEON),
+            )
+            rho = self.grid.create_field(1, dtype=self.precision_policy.store_precision)
+            u = self.grid.create_field(3, dtype=self.precision_policy.store_precision)
+            rho, u = macro(f_0, rho, u)
+            wp.synchronize()
+            wp.synchronize()
+            u.update_host(0)
+            rho.update_host(0)
+            wp.synchronize()
+            u.export_vti("u_init.vti", 'u')
+            rho.export_vti("rho_init.vti", 'rho')
+            rho, u = macro(f_1, rho, u)
+            wp.synchronize()
+            wp.synchronize()
+            u.update_host(0)
+            rho.update_host(0)
+            wp.synchronize()
+            u.export_vti("u_f1_init.vti", 'u')
+            rho.export_vti("rho_f1_init.vti", 'rho')
         # Process boundary conditions and update masks
         bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
         f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
+        bc_mask.update_host(0)
+        missing_mask.update_host(0)
+        wp.synchronize()
+        #bc_mask.export_vti("bc_mask.vti", 'bc_mask')
+        #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         return f_0, f_1, bc_mask, missing_mask
 
@@ -412,17 +443,18 @@ def container(
             cast_to_store_dtype = self.store_dtype
             def nse_stepper_ll(loader: neon.Loader):
                 loader.set_grid(bc_mask_fd.get_grid())
+
                 f_0_pn=(loader.get_read_handle(f_0_fd))
-                f_1_pn =loader.get_read_handle(f_1_fd)
                 bc_mask_pn=loader.get_read_handle(bc_mask_fd)
                 missing_mask_pn=loader.get_read_handle(missing_mask_fd)
 
+                f_1_pn =loader.get_write_handle(f_1_fd)
+
                 @wp.func
                 def nse_stepper_cl(index: typing.Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
-
                     # Apply streaming
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
@@ -452,29 +484,32 @@ def nse_stepper_cl(index: typing.Any):
 
         return None, container
 
+    def get_containers(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        _, container = self._construct_neon()
+        return {'even': container(f_0,  bc_mask, missing_mask, omega, timestep),
+                'odd': container(f_1, f_0, bc_mask, missing_mask, omega, timestep)}
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         #if self.c is None:
         #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
-        is_odd = timestep%2 == 1
-        is_even = not is_odd
-
-        c = None
-        if self.odd_or_even == 'even':
-            c = self.c_even
-        else:
-            c = self.c_odd
-
-        if c is None:
-            c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
+        # c = None
+        # if self.odd_or_even == 'even':
+        #     c = self.c_even
+        # else:
+        #     c = self.c_odd
+        #
+        # if c is None:
+        #     pass
+        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
         c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
-
-        if self.odd_or_even == 'even':
-            self.c_even = c
-        else:
-            self.c_odd = c
-
-        if self.odd_or_even == 'even':
-            self.odd_or_even = 'odd'
+        #
+        # if self.odd_or_even == 'even':
+        #     c = self.c_even
+        # else:
+        #     c = self.c_odd
+        #
+        # if self.odd_or_even == 'even':
+        #     self.odd_or_even = 'odd'
 
         return f_0, f_1
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index 5a1b99ca..0cac3c6a 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -133,8 +133,8 @@ def functional(
                 #     pull_index[d] = index[d] - _c[d, l]
 
                 ngh = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
-                                      wp.int8(0),
-                                      wp.int8(-_c[1, l]))
+                                      wp.int8(-_c[1, l]),
+                                      wp.int8(-_c[2, l]))
 
                 unused_is_valid = wp.bool(False)
                 _f[l] = wp.neon_ngh_data(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)

From 225114189d77381cd2b637c996a8d117c4b1bc8d Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 26 Feb 2025 11:15:50 +0100
Subject: [PATCH 006/208] WIP: build and run Nse_simulation containers; export after timing

---
 examples/performance/mlups_3d_neon_sovler.py | 11 +++--
 xlb/default_config.py                        |  8 ++--
 xlb/helper/nse_solver.py                     | 18 ++++----
 xlb/operator/stepper/nse_stepper.py          | 46 ++++++++++----------
 4 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/examples/performance/mlups_3d_neon_sovler.py b/examples/performance/mlups_3d_neon_sovler.py
index 2eb356be..5e1c1b86 100644
--- a/examples/performance/mlups_3d_neon_sovler.py
+++ b/examples/performance/mlups_3d_neon_sovler.py
@@ -17,7 +17,6 @@
 from xlb.operator.stepper import IncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
 from xlb.distribute import distribute
-import xlb
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
@@ -69,7 +68,7 @@ def setup_simulation(args):
     return backend, precision_policy
 
 
-def run(macro, backend, precision_policy, grid_shape, num_steps):
+def run( backend, precision_policy, grid_shape, num_steps):
     # Create grid and setup boundary conditions
     velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
     grid = grid_factory(grid_shape, velocity_set=velocity_set)
@@ -88,16 +87,17 @@ def run(macro, backend, precision_policy, grid_shape, num_steps):
     omega = 1.0
 
     sim = xlb.helper.nse_solver.Nse_simulation(grid, velocity_set, stepper, omega)
+    print("start timing")
     start_time = time.time()
 
     for i in range(num_steps):
         sim.step()
 
-        if i % 10 == 0 or i == num_steps - 1:
-           sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
+    t = time.time() - start_time
 
-    return time.time() - start_time
+    sim.export_macroscopic("u_lid_driven_cavity_")
+    return t
 
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time):
@@ -137,7 +137,6 @@ def post_process(macro, rho, u, f_0,  i):
 
 def main():
 
-
     args = parse_arguments()
     backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
diff --git a/xlb/default_config.py b/xlb/default_config.py
index 1a2aef65..ff238297 100644
--- a/xlb/default_config.py
+++ b/xlb/default_config.py
@@ -24,10 +24,10 @@ def init(velocity_set, default_backend, default_precision_policy):
         import warp as wp
         import neon
 
-        wp.config.mode = "release"
-        wp.config.llvm_cuda = False
-        wp.config.verbose = True
-        wp.verbose_warnings = True
+        #wp.config.mode = "release"
+        #wp.config.llvm_cuda = False
+        #wp.config.verbose = True
+        #wp.verbose_warnings = True
 
         wp.init()
 
diff --git a/xlb/helper/nse_solver.py b/xlb/helper/nse_solver.py
index 305402b5..f1a0108c 100644
--- a/xlb/helper/nse_solver.py
+++ b/xlb/helper/nse_solver.py
@@ -68,10 +68,11 @@ def __init__(self, grid, velocity_set, stepper, omega):
             precision_policy=self.precision_policy,
             velocity_set=self.velocity_set,
         )
-        self.iteration
+
+        self.__init_containers()
 
     def __init_containers(self):
-        containers = self.stepper.get_containers(self.f_0, self.f_1, self.bc_mask, self.missing_mask, self.rho, self.u)
+        containers = self.stepper.get_containers(self.f_0, self.f_1, self.bc_mask, self.missing_mask, self.omega, self.iteration_idx)
         self.even_step = containers['even']
         self.odd_step = containers['odd']
 
@@ -81,16 +82,15 @@ def __init_containers(self):
         self.odd_macroscopic = containers['odd']
 
     def export_macroscopic(self, fname_prefix):
-        self.iteration_idx += 1
-
         if self.iteration_idx % 2 == 0:
-            self.even_macroscopic()
+            self.even_macroscopic.run(0)
         else:
-            self.odd_macroscopic()
+            self.odd_macroscopic.run(0)
 
         import warp as wp
         wp.synchronize()
         self.u.update_host(0)
+        wp.synchronize()
         self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
 
         return
@@ -98,6 +98,8 @@ def export_macroscopic(self, fname_prefix):
     def step(self):
         self.iteration_idx += 1
         if self.iteration_idx % 2 == 0:
-            self.even_step()
+            print("running even")
+            self.even_step.run(0)
         else:
-            self.odd_step()
+            print("running odd")
+            self.odd_step.run(0)
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 288dbc7a..1385900d 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -92,29 +92,29 @@ def prepare_fields(self, initializer=None):
         if True:
             import xlb.velocity_set
             from xlb.operator.macroscopic import Macroscopic
-            macro = Macroscopic(
-                compute_backend=ComputeBackend.NEON,
-                precision_policy=self.precision_policy,
-                velocity_set=xlb.velocity_set.D3Q19(precision_policy=self.precision_policy, backend=ComputeBackend.NEON),
-            )
+            # macro = Macroscopic(
+            #     compute_backend=ComputeBackend.NEON,
+            #     precision_policy=self.precision_policy,
+            #     velocity_set=xlb.velocity_set.D3Q19(precision_policy=self.precision_policy, backend=ComputeBackend.NEON),
+            # )
             rho = self.grid.create_field(1, dtype=self.precision_policy.store_precision)
             u = self.grid.create_field(3, dtype=self.precision_policy.store_precision)
-            rho, u = macro(f_0, rho, u)
-            wp.synchronize()
-            wp.synchronize()
-            u.update_host(0)
-            rho.update_host(0)
-            wp.synchronize()
-            u.export_vti("u_init.vti", 'u')
-            rho.export_vti("rho_init.vti", 'rho')
-            rho, u = macro(f_1, rho, u)
-            wp.synchronize()
-            wp.synchronize()
-            u.update_host(0)
-            rho.update_host(0)
-            wp.synchronize()
-            u.export_vti("u_f1_init.vti", 'u')
-            rho.export_vti("rho_f1_init.vti", 'rho')
+            # rho, u = macro(f_0, rho, u)
+            # wp.synchronize()
+            # wp.synchronize()
+            # u.update_host(0)
+            # rho.update_host(0)
+            # wp.synchronize()
+            # u.export_vti("u_init.vti", 'u')
+            # rho.export_vti("rho_init.vti", 'rho')
+            # rho, u = macro(f_1, rho, u)
+            # wp.synchronize()
+            # wp.synchronize()
+            # u.update_host(0)
+            # rho.update_host(0)
+            # wp.synchronize()
+            # u.export_vti("u_f1_init.vti", 'u')
+            # rho.export_vti("rho_f1_init.vti", 'rho')
         # Process boundary conditions and update masks
         bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
@@ -486,8 +486,8 @@ def nse_stepper_cl(index: typing.Any):
 
     def get_containers(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         _, container = self._construct_neon()
-        return {'even': container(f_0,  bc_mask, missing_mask, omega, timestep),
-                'odd': container(f_1, f_0, bc_mask, missing_mask, omega, timestep)}
+        return {'even': container(f_0, f_1,  bc_mask, missing_mask, omega, 0),
+                'odd': container(f_1, f_0, bc_mask, missing_mask, omega, 1)}
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):

From 6a5fe5c2ad7ce89f030bdc03555182687d5725dd Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 26 Feb 2025 14:44:42 +0100
Subject: [PATCH 007/208] WIP: use warp functionals in Macroscopic; add xlb_grid to warp masker

---
 xlb/operator/boundary_masker/indices_boundary_masker.py | 2 +-
 xlb/operator/macroscopic/macroscopic.py                 | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 9d38615f..63c292a4 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -158,7 +158,7 @@ def kernel(
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
+    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
         # Pre-allocate arrays with maximum possible size
         max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
         indices = np.zeros((3, max_size), dtype=np.int32)
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index b7ace56c..586f65c7 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -26,8 +26,8 @@ def jax_implementation(self, f):
         return rho, u
 
     def _construct_warp(self):
-        zero_moment_func = self.zero_moment.neon_functional
-        first_moment_func = self.first_moment.neon_functional
+        zero_moment_func = self.zero_moment.warp_functional
+        first_moment_func = self.first_moment.warp_functional
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         @wp.func

From d0f91b7a2c02d63b4912b1cdbcbd7c79d24e11e7 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 28 Feb 2025 18:31:47 +0100
Subject: [PATCH 008/208] WIP

---
 examples/performance/mlups_3d_neon_sovler.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/examples/performance/mlups_3d_neon_sovler.py b/examples/performance/mlups_3d_neon_sovler.py
index 5e1c1b86..eb4ee8b9 100644
--- a/examples/performance/mlups_3d_neon_sovler.py
+++ b/examples/performance/mlups_3d_neon_sovler.py
@@ -78,13 +78,20 @@ def run( backend, precision_policy, grid_shape, num_steps):
     walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
 
-    boundary_conditions = [EquilibriumBC(rho=1.0, u=(0.02, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
+    prescribed_vel = 0.05
+
+    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
 
     # Create stepper
     stepper = IncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    # Initialize fields and run simulation
-    omega = 1.0
+    Re = 100000
+    clength = grid_shape[0] - 1
+    visc = prescribed_vel * clength / Re
+    omega = 1.0 / (3.0 * visc + 0.5)
+
+    # # Initialize fields and run simulation
+    # omega = 1.0
 
     sim = xlb.helper.nse_solver.Nse_simulation(grid, velocity_set, stepper, omega)
     print("start timing")

From b529acb8422b91605515c3f93de6691a38a5b397 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 3 Mar 2025 13:33:32 +0100
Subject: [PATCH 009/208] WIP

---
 examples/performance/mlups_3d_neon_sovler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/performance/mlups_3d_neon_sovler.py b/examples/performance/mlups_3d_neon_sovler.py
index eb4ee8b9..a5c65af2 100644
--- a/examples/performance/mlups_3d_neon_sovler.py
+++ b/examples/performance/mlups_3d_neon_sovler.py
@@ -85,7 +85,7 @@ def run( backend, precision_policy, grid_shape, num_steps):
     # Create stepper
     stepper = IncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    Re = 100000
+    Re = 10000.0
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
     omega = 1.0 / (3.0 * visc + 0.5)
@@ -99,7 +99,8 @@ def run( backend, precision_policy, grid_shape, num_steps):
 
     for i in range(num_steps):
         sim.step()
-
+        if i%500 == 0:
+            sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
 

From b2713979f26441ed3b43d246580a8f97518fa406 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 3 Mar 2025 15:28:10 +0100
Subject: [PATCH 010/208] WIP

---
 xlb/grid/neon_grid.py                   |  7 +++----
 xlb/helper/nse_solver.py                | 10 +++++++---
 xlb/operator/macroscopic/macroscopic.py | 12 +++++++++++-
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index 2f74598a..d8ce71c9 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -23,10 +23,6 @@ def __init__(self, shape, velocity_set):
     def _get_velocity_set(self):
         return self.xlb_lattice
 
-    def _initialize_backend(self):
-        # do nothing
-        pass
-
     def _initialize_backend(self):
 
         # FIXME@max: for now we hardcode the number of devices to 0
@@ -131,3 +127,6 @@ def cloning(gridIdx: typing.Any):
         c.run(0)
         wp.synchronize()
         return warp_field
+
+    def get_neon_backend(self):
+        return self.bk
diff --git a/xlb/helper/nse_solver.py b/xlb/helper/nse_solver.py
index f1a0108c..b22a9491 100644
--- a/xlb/helper/nse_solver.py
+++ b/xlb/helper/nse_solver.py
@@ -2,7 +2,7 @@
 from xlb.grid import grid_factory
 from xlb.precision_policy import Precision
 from typing import Tuple
-
+import neon
 
 def create_nse_fields(
     grid_shape: Tuple[int, int, int] = None,
@@ -81,6 +81,12 @@ def __init_containers(self):
         self.even_macroscopic = containers['even']
         self.odd_macroscopic = containers['odd']
 
+        self.skeleton_even = neon.Skeleton(self.grid.get_neon_backend())
+        self.skeleton_odd = neon.Skeleton(self.grid.get_neon_backend())
+
+        self.skeleton_even.sequence(name="even lbm", containers=[self.even_step])
+        self.skeleton_odd.sequence(name="odd lbm", containers=[self.odd_step])
+
     def export_macroscopic(self, fname_prefix):
         if self.iteration_idx % 2 == 0:
             self.even_macroscopic.run(0)
@@ -98,8 +104,6 @@ def export_macroscopic(self, fname_prefix):
     def step(self):
         self.iteration_idx += 1
         if self.iteration_idx % 2 == 0:
-            print("running even")
             self.even_step.run(0)
         else:
-            print("running odd")
             self.odd_step.run(0)
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 586f65c7..7b046fbf 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -66,7 +66,17 @@ def warp_implementation(self, f, rho, u):
         return rho, u
 
     def _construct_neon(self):
-        functional, _ = self._construct_warp()
+        zero_moment_func = self.zero_moment.neon_functional
+        first_moment_func = self.first_moment.neon_functional
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(f: _f_vec):
+            rho = zero_moment_func(f)
+            u = first_moment_func(f, rho)
+            return rho, u
+
+
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         import neon, typing

From 7040de1a0dcf11c2c8ef2ac8275af3f282c976bc Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 5 Mar 2025 17:43:43 +0100
Subject: [PATCH 011/208] WIP

---
 xlb/grid/__init__.py                          |   1 +
 xlb/grid/grid.py                              |  29 +-
 xlb/grid/multires_grid.py                     | 146 +++++++++
 xlb/helper/__init__.py                        |   2 +
 xlb/helper/initializers.py                    |  20 +-
 xlb/helper/nse_multires_solver.py             |  85 +++++
 .../indices_boundary_masker.py                |   6 +-
 xlb/operator/equilibrium/__init__.py          |   1 +
 .../mulltires_quadratic_equilibrium.py        |  97 ++++++
 xlb/operator/macroscopic/__init__.py          |   1 +
 .../macroscopic/multires_macroscopic.py       | 120 +++++++
 xlb/operator/stepper/__init__.py              |   1 +
 xlb/operator/stepper/nse_multires_stepper.py  | 292 ++++++++++++++++++
 13 files changed, 795 insertions(+), 6 deletions(-)
 create mode 100644 xlb/grid/multires_grid.py
 create mode 100644 xlb/helper/nse_multires_solver.py
 create mode 100644 xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
 create mode 100644 xlb/operator/macroscopic/multires_macroscopic.py
 create mode 100644 xlb/operator/stepper/nse_multires_stepper.py

diff --git a/xlb/grid/__init__.py b/xlb/grid/__init__.py
index 7d9ec24b..4fd48394 100644
--- a/xlb/grid/__init__.py
+++ b/xlb/grid/__init__.py
@@ -1 +1,2 @@
 from xlb.grid.grid import grid_factory as grid_factory
+from xlb.grid.grid import multires_grid_factory as multires_grid_factory
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index 2743a902..4ab6cfee 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -1,10 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import Tuple
+from typing import Tuple, List
 import numpy as np
 
 from xlb import DefaultConfig
 from xlb.compute_backend import ComputeBackend
-
+import neon
 
 def grid_factory(shape: Tuple[int, ...],
                  compute_backend: ComputeBackend = None,
@@ -27,6 +27,31 @@ def grid_factory(shape: Tuple[int, ...],
 
     raise ValueError(f"Compute backend {compute_backend} is not supported")
 
+def multires_grid_factory(shape: Tuple[int, ...],
+                 compute_backend: ComputeBackend = None,
+                 velocity_set=None,
+                 sparsity_pattern_list: List[np.ndarray] = None,
+                 sparsity_pattern_origins: List[neon.Index_3d] = None,
+                 ):
+    """Build a multi-resolution grid; every backend except NEON raises ValueError.
+
+    `compute_backend` falls back to DefaultConfig.default_backend when falsy. The two
+    sparsity arguments default to fresh empty lists and are forwarded to NeonMultiresGrid.
+    """
+    compute_backend = compute_backend or DefaultConfig.default_backend
+    # None sentinels instead of `[]` defaults: the grid keeps these references, so a shared
+    # default list would be aliased by every grid built without explicit arguments.
+    sparsity_pattern_list = [] if sparsity_pattern_list is None else sparsity_pattern_list
+    sparsity_pattern_origins = [] if sparsity_pattern_origins is None else sparsity_pattern_origins
+    if compute_backend == ComputeBackend.NEON:
+        from xlb.grid.multires_grid import NeonMultiresGrid
+
+        return NeonMultiresGrid(shape=shape,
+                                velocity_set=velocity_set,
+                                sparsity_pattern_list=sparsity_pattern_list,
+                                sparsity_pattern_origins=sparsity_pattern_origins)
+    raise ValueError(f"Compute backend {compute_backend} is not supported for multires grid")
+
 
 class Grid(ABC):
     def __init__(self,
diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
new file mode 100644
index 00000000..5803e027
--- /dev/null
+++ b/xlb/grid/multires_grid.py
@@ -0,0 +1,146 @@
+import numpy as np
+import warp as wp
+import neon
+from .grid import Grid
+from xlb.precision_policy import Precision
+from xlb.compute_backend import ComputeBackend
+from typing import Literal, List
+from xlb import DefaultConfig
+
+
+class NeonMultiresGrid(Grid):
+    def __init__(self, shape,
+                 velocity_set,
+                 sparsity_pattern_list: List[np.ndarray],
+                 sparsity_pattern_origins: List[neon.Index_3d],):
+        from .warp_grid import WarpGrid
+
+        self.bk = None
+        self.dim = None
+        self.grid = None
+        self.xlb_lattice = velocity_set
+        self.warp_grid = WarpGrid(shape)
+        self.sparsity_pattern_list = sparsity_pattern_list
+        self.sparsity_pattern_origins = sparsity_pattern_origins
+        self.count_levels = len(sparsity_pattern_list)
+
+        super().__init__(shape, ComputeBackend.NEON)
+
+    def _get_velocity_set(self):
+        return self.xlb_lattice
+
+    def _initialize_backend(self):
+        """Create the neon backend and the multi-resolution grid (sets self.dim, self.neon_stencil, self.bk, self.grid)."""
+        # FIXME@max: for now the device list is hardcoded to a single device (index 0)
+        num_devs = 1
+        dev_idx_list = list(range(num_devs))
+
+        if len(self.shape) == 2:
+            # 2D case: the second neon axis has extent 1 and XLB's second axis is stored on neon's third.
+            self.dim = neon.Index_3d(self.shape[0],
+                                     1,
+                                     self.shape[1])
+            self.neon_stencil = []
+            for c_idx in range(len(self.xlb_lattice._c[0])):
+                xval = self.xlb_lattice._c[0][c_idx]
+                yval = self.xlb_lattice._c[1][c_idx]
+                self.neon_stencil.append([xval, 0, yval])
+
+        else:
+            self.dim = neon.Index_3d(self.shape[0],
+                                        self.shape[1],
+                                        self.shape[2])
+
+            self.neon_stencil = []
+            for c_idx in range(len(self.xlb_lattice._c[0])):
+                xval = self.xlb_lattice._c[0][c_idx]
+                yval = self.xlb_lattice._c[1][c_idx]
+                zval = self.xlb_lattice._c[2][c_idx]
+                self.neon_stencil.append([xval, yval, zval])
+
+        self.bk = neon.Backend(
+            runtime=neon.Backend.Runtime.stream,
+            dev_idx_list=dev_idx_list)
+
+        # neon.multires.mGrid is called here with the keyword arguments:
+        #  backend: neon.Backend,
+        #  dim,
+        #  sparsity_pattern_list: List[np.ndarray],
+        #  sparsity_pattern_origins: List[neon.Index_3d],
+        #  stencil: List[List[int]]
+        self.grid = neon.multires.mGrid(
+            backend=self.bk,
+            dim=self.dim,
+            sparsity_pattern_list=self.sparsity_pattern_list,
+            sparsity_pattern_origins=self.sparsity_pattern_origins,
+            stencil=self.neon_stencil)
+        # Nothing is returned; results live on the instance attributes set above.
+
+    def create_field(
+            self,
+            cardinality: int,
+            dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+            fill_value=None,
+    ):
+        dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype  # no dtype given: use the default store precision
+        field = self.grid.new_field(cardinality=cardinality,
+                                    dtype=dtype, )
+        for l in range(self.count_levels):  # initialise every level: zero it, or fill with fill_value when one is given
+            if fill_value is None:
+                field.zero_run(l, stream_idx = 0)
+            else:
+                field.fill_run(level= l, value=fill_value,stream_idx = 0)
+        return field
+
+    def _create_warp_field(self,
+                           cardinality: int,
+                           dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+                           fill_value=None,
+                           ne_field=None
+    ):
+        print("WARNING: allocating warp fields for mres is temporary and only a work around!")
+        warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
+        if ne_field is None:
+            return warp_field  # nothing to copy: hand back the freshly allocated warp field
+
+        _d = self.xlb_lattice.d
+
+        import typing
+        @neon.Container.factory(mame="cloning-warp")  # NOTE(review): `mame` looks like a typo for `name` (used by the other factories) — confirm
+        def container(
+                src_field: typing.Any,
+                dst_field: typing.Any,
+                cardinality: wp.int32
+        ):
+            def loading_step(loader: neon.Loader):
+                loader.declare_execution_scope(self.grid, level=0)  # only level 0 is copied
+                src_pn = loader.get_read_handel(src_field)  # NOTE(review): spelled `handel` here, `handle` elsewhere — confirm against the neon Loader API
+
+                @wp.func
+                def cloning(gridIdx: typing.Any):
+                    cIdx = wp.neon_global_idx(src_pn, gridIdx)
+                    gx = wp.neon_get_x(cIdx)
+                    gy = wp.neon_get_y(cIdx)
+                    gz = wp.neon_get_z(cIdx)
+
+                    # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+                    if _d == 2:
+                        gy, gz = gz, gy
+
+                    for card in range(cardinality):  # copy every component of the cell
+                        value = wp.neon_read(src_pn,
+                                      gridIdx,
+                                      card)
+                        dst_field[card, gx, gy, gz] = value
+
+                loader.declare_kernel(cloning)
+
+            return loading_step
+
+        c = container(src_field=ne_field, dst_field=warp_field, cardinality=cardinality)
+        c.run(0)
+        wp.synchronize()
+        return warp_field
+
+    def get_neon_backend(self):
+        return self.bk
diff --git a/xlb/helper/__init__.py b/xlb/helper/__init__.py
index d52f2063..1a3de972 100644
--- a/xlb/helper/__init__.py
+++ b/xlb/helper/__init__.py
@@ -1,3 +1,5 @@
 from xlb.helper.nse_solver import create_nse_fields
 from xlb.helper.initializers import initialize_eq
+from xlb.helper.initializers import initialize_multires_eq
 from xlb.helper.check_boundary_overlaps import check_bc_overlaps
+from xlb.helper.nse_multires_solver import Nse_multires_simulation
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 40f7f9a4..0b07f480 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -1,6 +1,6 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.equilibrium import QuadraticEquilibrium
-
+from xlb.operator.equilibrium import MultiresQuadraticEquilibrium
 
 def initialize_eq(f, grid, velocity_set, precision_policy, backend, rho=None, u=None):
     if rho is None:
@@ -24,3 +24,21 @@ def initialize_eq(f, grid, velocity_set, precision_policy, backend, rho=None, u=
     del rho, u
 
     return f
+
+
+def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho=None, u=None):
+    if rho is None:
+        rho = grid.create_field(cardinality=1, fill_value=1.0, dtype=precision_policy.compute_precision)
+    if u is None:
+        u = grid.create_field(cardinality=velocity_set.d, fill_value=0.0, dtype=precision_policy.compute_precision)
+    equilibrium = MultiresQuadraticEquilibrium()
+    if backend == ComputeBackend.NEON:
+        for level in range(grid.count_levels):
+            equilibrium(level,rho, u, f)
+        pass
+    else:
+        raise NotImplementedError(f"Backend {backend} not implemented")
+
+    del rho, u
+
+    return f
\ No newline at end of file
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
new file mode 100644
index 00000000..ebbc1b03
--- /dev/null
+++ b/xlb/helper/nse_multires_solver.py
@@ -0,0 +1,85 @@
+import numpy as np
+
+from xlb import DefaultConfig
+from xlb.grid.multires_grid import NeonMultiresGrid
+from xlb.precision_policy import Precision
+from typing import Tuple, List
+import neon
+
+
+class Nse_multires_simulation:
+    def __init__(self, grid, velocity_set, stepper, omega):
+        self.stepper = stepper
+        self.grid = stepper.get_grid()
+        self.precision_policy = stepper.get_precision_policy()
+        self.velocity_set = velocity_set
+        self.omega = omega
+        count_levels = grid.count_levels
+        # Create fields
+        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields()
+        # self.f_0 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        # self.f_1 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        # self.missing_mask = grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
+        # self.bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
+
+        self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
+        self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
+
+        self.odd_step = None
+        self.even_step = None
+        self.iteration_idx = -1
+        from xlb.operator.macroscopic import MultiresMacroscopic
+
+        self.macro = MultiresMacroscopic(
+            compute_backend=self.grid.compute_backend,
+            precision_policy=self.precision_policy,
+            velocity_set=self.velocity_set,
+        )
+
+        self.__init_containers(count_levels)
+
+    def __init_containers(self, num_levels):
+        # working only with level 0 for now
+        target_level = 0
+        containers = self.stepper.get_containers(target_level,
+                                                 self.f_0,
+                                                 self.f_1,
+                                                 self.bc_mask,
+                                                 self.missing_mask,
+                                                 self.omega,
+                                                 self.iteration_idx)
+
+        self.even_step = containers['even']
+        self.odd_step = containers['odd']
+
+        containers = self.macro.get_containers(target_level, self.f_0, self.f_1,self.rho, self.u)
+
+        self.even_macroscopic = containers['even']
+        self.odd_macroscopic = containers['odd']
+
+        self.skeleton_even = neon.Skeleton(self.grid.get_neon_backend())
+        self.skeleton_odd = neon.Skeleton(self.grid.get_neon_backend())
+
+        self.skeleton_even.sequence(name="even lbm", containers=[self.even_step])
+        self.skeleton_odd.sequence(name="odd lbm", containers=[self.odd_step])
+
+    def export_macroscopic(self, fname_prefix):
+        """Run the macroscopic container(s) for the current parity, sync u to host and export it as `<fname_prefix><iteration>.vti`."""
+        macro = self.even_macroscopic if self.iteration_idx % 2 == 0 else self.odd_macroscopic
+        # get_containers() hands back a list of containers per parity; accept a bare container too.
+        for c in (macro if isinstance(macro, (list, tuple)) else [macro]):
+            c.run(0)
+        import warp as wp
+        wp.synchronize()
+        self.u.update_host(0)
+        wp.synchronize()
+        self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
+
+        return
+
+    def step(self):
+        self.iteration_idx += 1
+        if self.iteration_idx % 2 == 0:
+            self.even_step.run(0)
+        else:
+            self.odd_step.run(0)
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 63c292a4..3eed597f 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -303,10 +303,10 @@ def container(
                 missing_mask_field: typing.Any,
         ):
             def loading_step(loader: neon.Loader):
-                loader.set_grid(bc_mask.get_grid())
+                loader.set_mres_grid(bc_mask.get_grid(), 0)
 
-                bc_mask_hdl = loader.get_read_handle(bc_mask_field)
-                missing_mask_hdl = loader.get_read_handle(missing_mask_field)
+                bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
+                missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
 
                 @wp.func
                 def masker(gridIdx: typing.Any):
diff --git a/xlb/operator/equilibrium/__init__.py b/xlb/operator/equilibrium/__init__.py
index 987aa74a..474bdf5c 100644
--- a/xlb/operator/equilibrium/__init__.py
+++ b/xlb/operator/equilibrium/__init__.py
@@ -1 +1,2 @@
 from xlb.operator.equilibrium.quadratic_equilibrium import Equilibrium, QuadraticEquilibrium
+from xlb.operator.equilibrium.mulltires_quadratic_equilibrium import MultiresQuadraticEquilibrium
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
new file mode 100644
index 00000000..fce7ae1a
--- /dev/null
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -0,0 +1,97 @@
+from functools import partial
+import jax.numpy as jnp
+from jax import jit
+import warp as wp
+import os
+
+# Print the PYTHONPATH
+pythonpath = os.environ.get('PYTHONPATH', 'PYTHONPATH is not set')
+print(f"PYTHONPATH: {pythonpath}")
+import neon
+from typing import Any
+
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.equilibrium.equilibrium import Equilibrium
+from xlb.operator import Operator
+
+
+class MultiresQuadraticEquilibrium(Equilibrium):
+    """
+    Quadratic equilibrium of Boltzmann equation using hermite polynomials.
+    Standard equilibrium model for LBM.
+    """
+
+    def _construct_neon(self):
+        import neon
+        # Set local constants TODO: This is a hack and should be fixed with warp update
+        _c = self.velocity_set.c
+        _w = self.velocity_set.w
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+
+        # Construct the equilibrium functional
+        @wp.func
+        def functional(
+            rho: Any,
+            u: Any,
+        ):
+            # Allocate the equilibrium
+            feq = _f_vec()
+
+            # Compute the equilibrium
+            for l in range(self.velocity_set.q):
+                # Compute cu
+                cu = self.compute_dtype(0.0)
+                for d in range(self.velocity_set.d):
+                    if _c[d, l] == 1:
+                        cu += u[d]
+                    elif _c[d, l] == -1:
+                        cu -= u[d]
+                cu *= self.compute_dtype(3.0)
+
+                # Compute usqr
+                usqr = self.compute_dtype(1.5) * wp.dot(u, u)
+
+                # Compute feq
+                feq[l] = rho * _w[l] * (self.compute_dtype(1.0) + cu * (self.compute_dtype(1.0) + self.compute_dtype(0.5) * cu) - usqr)
+
+            return feq
+
+        import typing
+        @neon.Container.factory(name="QuadraticEquilibrium")
+        def container(
+                level,
+            rho: Any,
+            u: Any,
+            f: Any,
+        ):
+
+            def quadratic_equilibrium_ll(loader:neon.Loader):
+                loader.set_mres_grid(rho.get_grid(), level)
+
+                rho_pn=loader.get_mres_read_handle(rho)
+                u_pn =loader.get_mres_read_handle(u)
+                f_pn=loader.get_mres_write_handle(f)
+
+                @wp.func
+                def quadratic_equilibrium_cl(index: typing.Any):
+                    _u = _u_vec()
+                    for d in range(self.velocity_set.d):
+                        _u[d] = wp.neon_read(u_pn, index, d)
+                    _rho = wp.neon_read(rho_pn, index, 0)
+                    feq = functional(_rho, _u)
+
+                    # Set the output
+                    for l in range(self.velocity_set.q):
+                        #wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                        wp.neon_write(f_pn, index, l, feq[l])
+                loader.declare_kernel(quadratic_equilibrium_cl)
+            return quadratic_equilibrium_ll
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, leve, rho, u, f):  # `leve` is the grid level handed to the container
+        c = self.neon_container( leve, rho, u, f)
+        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+        return f
diff --git a/xlb/operator/macroscopic/__init__.py b/xlb/operator/macroscopic/__init__.py
index 75dec9ea..75eacee6 100644
--- a/xlb/operator/macroscopic/__init__.py
+++ b/xlb/operator/macroscopic/__init__.py
@@ -2,3 +2,4 @@
 from xlb.operator.macroscopic.second_moment import SecondMoment
 from xlb.operator.macroscopic.zero_moment import ZeroMoment
 from xlb.operator.macroscopic.first_moment import FirstMoment
+from xlb.operator.macroscopic.multires_macroscopic import MultiresMacroscopic
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
new file mode 100644
index 00000000..d79428cc
--- /dev/null
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -0,0 +1,120 @@
+from functools import partial
+import jax.numpy as jnp
+from jax import jit
+import warp as wp
+from typing import Any
+
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+from xlb.operator.macroscopic.zero_moment import ZeroMoment
+from xlb.operator.macroscopic.first_moment import FirstMoment
+
+
+class MultiresMacroscopic(Operator):
+    """A class to compute both zero and first moments of distribution functions (rho, u)."""
+
+    def __init__(self, *args, **kwargs):
+        self.zero_moment = ZeroMoment(*args, **kwargs)
+        self.first_moment = FirstMoment(*args, **kwargs)
+        super().__init__(*args, **kwargs)
+
+
+    def _construct_warp(self):
+        zero_moment_func = self.zero_moment.warp_functional
+        first_moment_func = self.first_moment.warp_functional
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(f: _f_vec):
+            rho = zero_moment_func(f)
+            u = first_moment_func(f, rho)
+            return rho, u
+
+        @wp.kernel
+        def kernel(
+            f: wp.array4d(dtype=Any),
+            rho: wp.array4d(dtype=Any),
+            u: wp.array4d(dtype=Any),
+        ):
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            _f = _f_vec()
+            for l in range(self.velocity_set.q):
+                _f[l] = f[l, index[0], index[1], index[2]]
+            _rho, _u = functional(_f)
+
+            rho[0, index[0], index[1], index[2]] = self.store_dtype(_rho)
+            for d in range(self.velocity_set.d):
+                u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f, rho, u):
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f, rho, u],
+            dim=rho.shape[1:],
+        )
+        return rho, u
+
+    def _construct_neon(self):
+        """Return (functional, container): the per-cell rho/u functional and the neon container factory wrapping it."""
+        zero_moment_func = self.zero_moment.neon_functional
+        first_moment_func = self.first_moment.neon_functional
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(f: _f_vec):
+            rho = zero_moment_func(f)
+            u = first_moment_func(f, rho)
+            return rho, u
+
+        # The container below reads the populations f and writes rho and u on a single grid level.
+        import neon, typing
+        @neon.Container.factory("macroscopic")
+        def container(
+                level: int,
+                f_field: Any,
+                rho_field: Any,
+                u_fild: Any,
+        ):
+            _d = self.velocity_set.d
+            def macroscopic_ll(loader: neon.Loader):
+                loader.set_mres_grid(f_field.get_grid(), level)
+
+                # Access modes mirror what the kernel does: f is only read, rho and u are only written.
+                f = loader.get_mres_read_handle(f_field)
+                rho = loader.get_mres_write_handle(rho_field)
+                u = loader.get_mres_write_handle(u_fild)
+
+                @wp.func
+                def macroscopic_cl(gIdx: typing.Any):
+                    _f = _f_vec()
+                    for l in range(self.velocity_set.q):
+                        _f[l] = wp.neon_read(f, gIdx,l)
+                    _rho, _u = functional(_f)
+                    wp.neon_write(rho, gIdx, 0, _rho)
+                    for d in range(_d):
+                        wp.neon_write(u, gIdx, d, _u[d])
+
+                loader.declare_kernel(macroscopic_cl)
+            return macroscopic_ll
+        return functional, container
+
+    def get_containers(self, target_level, f_0, f_1, rho, u):
+        _, container = self._construct_neon()
+        evenList = []
+        oddList = []
+        evenList.append(container(target_level, f_0,   rho, u))
+        oddList.append( container(target_level, f_1,  rho, u))
+        return {'even':evenList ,
+                'odd':oddList }
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f, rho, u, level=0):  # level: grid level the container runs on (its first argument)
+        c = self.neon_container(level, f, rho, u)
+        c.run(0)
+        wp.synchronize()
+        return rho, u
diff --git a/xlb/operator/stepper/__init__.py b/xlb/operator/stepper/__init__.py
index e5d159c6..1c9722f1 100644
--- a/xlb/operator/stepper/__init__.py
+++ b/xlb/operator/stepper/__init__.py
@@ -1,2 +1,3 @@
 from xlb.operator.stepper.stepper import Stepper
 from xlb.operator.stepper.nse_stepper import IncompressibleNavierStokesStepper
+from xlb.operator.stepper.nse_multires_stepper import MultiresIncompressibleNavierStokesStepper
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
new file mode 100644
index 00000000..aa19f82c
--- /dev/null
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -0,0 +1,292 @@
+# Base class for all stepper operators
+
+from functools import partial
+
+from docutils.nodes import container
+from jax import jit
+import warp as wp
+import neon
+from typing import Any
+
+from xlb import DefaultConfig
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import Precision
+from xlb.operator import Operator
+from xlb.operator.stream import Stream
+from xlb.operator.collision import BGK, KBC
+from xlb.operator.equilibrium import QuadraticEquilibrium
+from xlb.operator.macroscopic import Macroscopic
+from xlb.operator.stepper import Stepper
+from xlb.operator.boundary_condition.boundary_condition import ImplementationStep
+from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
+from xlb.operator.collision import ForcedCollision
+from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
+from xlb.helper import check_bc_overlaps
+from xlb.helper.nse_solver import create_nse_fields
+
+
+class MultiresIncompressibleNavierStokesStepper(Stepper):
+    """Navier-Stokes LBM stepper whose NEON backend builds Neon containers for a given grid level."""
+    def __init__(
+        self,
+        grid,
+        boundary_conditions=None,
+        collision_type="BGK",
+        forcing_scheme="exact_difference",
+        force_vector=None,
+    ):
+        """Create the collision ("BGK" or "KBC"), stream, equilibrium and macroscopic operators."""
+        super().__init__(grid, [] if boundary_conditions is None else boundary_conditions)  # fresh list, never a shared default
+        self.odd_or_even = 'even'
+        self.c_even = self.c_odd = None
+
+        # Construct the collision operator; unknown names are rejected here.
+        collision_classes = {"BGK": BGK, "KBC": KBC}
+        if collision_type not in collision_classes:
+            raise ValueError(f"Unsupported collision_type: {collision_type!r}")
+        self.collision = collision_classes[collision_type](self.velocity_set, self.precision_policy, self.compute_backend)
+        if force_vector is not None:
+            self.collision = ForcedCollision(collision_operator=self.collision, forcing_scheme=forcing_scheme, force_vector=force_vector)
+
+        # Construct the operators
+        self.stream = Stream(self.velocity_set, self.precision_policy, self.compute_backend)
+        self.equilibrium = QuadraticEquilibrium(self.velocity_set, self.precision_policy, self.compute_backend)
+        self.macroscopic = Macroscopic(self.velocity_set, self.precision_policy, self.compute_backend)
+
+    def prepare_fields(self, initializer=None):
+        """Prepare the fields required for the stepper.
+
+        Args:
+            initializer: Optional operator to initialize the distribution functions.
+                        If provided, it should be a callable that takes (grid, velocity_set,
+                        precision_policy, compute_backend) as arguments and returns initialized f_0.
+                        If None, default equilibrium initialization is used with rho=1 and u=0.
+
+        Returns:
+            Tuple of (f_0, f_1, bc_mask, missing_mask):
+                - f_0: Initial distribution functions
+                - f_1: Copy of f_0 for double-buffering
+                - bc_mask: Boundary condition mask indicating which BC applies to each node
+                - missing_mask: Mask indicating which populations are missing at boundary nodes
+        """
+        # Create fields using the helper function
+        _, f_0, f_1, missing_mask, bc_mask = create_nse_fields(
+            grid=self.grid, velocity_set=self.velocity_set, compute_backend=self.compute_backend, precision_policy=self.precision_policy
+        )
+
+        # Initialize distribution functions if initializer is provided
+        if initializer is not None:
+            f_0 = initializer(self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
+        else:
+            from xlb.helper.initializers import initialize_multires_eq
+
+            f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
+
+        if self.compute_backend == ComputeBackend.NEON:
+            for level in range(self.grid.count_levels):
+                f_1.copy_from_run(level, f_0, 0)  # done for every grid level
+
+        # Process boundary conditions and update masks
+        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
+        # Initialize auxiliary data if needed
+        f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
+        bc_mask.update_host(0)
+        missing_mask.update_host(0)
+        wp.synchronize()
+        #bc_mask.export_vti("bc_mask.vti", 'bc_mask')
+        #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
+
+        return f_0, f_1, bc_mask, missing_mask
+
+    @classmethod
+    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
+        """Process boundary conditions and update boundary masks."""
+        # Check for boundary condition overlaps
+        check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
+        # Create boundary maskers
+        indices_masker = IndicesBoundaryMasker(
+            velocity_set=DefaultConfig.velocity_set,
+            precision_policy=DefaultConfig.default_precision_policy,
+            compute_backend=DefaultConfig.default_backend,
+        )
+        # Split boundary conditions by type
+        bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
+        bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
+        # Process indices-based boundary conditions
+        if bc_with_indices:
+            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
+        # Process mesh-based boundary conditions for 3D
+        if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
+            mesh_masker = MeshBoundaryMasker(
+                velocity_set=DefaultConfig.velocity_set,
+                precision_policy=DefaultConfig.default_precision_policy,
+                compute_backend=DefaultConfig.default_backend,
+            )
+            for bc in bc_with_vertices:
+                bc_mask, missing_mask = mesh_masker(bc, bc_mask, missing_mask)
+
+        return bc_mask, missing_mask
+
+    @staticmethod
+    def _initialize_auxiliary_data(boundary_conditions, f_0, f_1, bc_mask, missing_mask):
+        """Initialize auxiliary data for boundary conditions that require it."""
+        for bc in boundary_conditions:
+            if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
+                f_0, f_1 = bc.aux_data_init(f_0, f_1, bc_mask, missing_mask)
+        return f_0, f_1
+
+    def _construct_neon(self):
+        """Define the Warp helpers and the Neon container factory; returns ``(None, container)``."""
+        # Set local constants
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+        _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
+        _opp_indices = self.velocity_set.opp_indices
+
+        # Read the list of bc_to_id created upon instantiation
+        bc_to_id = boundary_condition_registry.bc_to_id
+        id_to_bc = boundary_condition_registry.id_to_bc
+
+        # Gather IDs of ExtrapolationOutflowBC boundary conditions
+        extrapolation_outflow_bc_ids = []
+        for bc_name, bc_id in bc_to_id.items():
+            if bc_name.startswith("ExtrapolationOutflowBC"):
+                extrapolation_outflow_bc_ids.append(bc_id)
+        # Group active boundary conditions
+        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
+
+        @wp.func
+        def apply_bc(
+            index: Any,
+            timestep: Any,
+            _boundary_id: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+            is_post_streaming: bool,
+        ):
+            f_result = f_post
+
+            # Unroll the loop over boundary conditions
+            for i in range(wp.static(len(self.boundary_conditions))):
+                if is_post_streaming:
+                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                else:
+                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                    if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].prepare_bc_auxilary_data)(
+                                index, timestep, missing_mask, f_0, f_1, f_pre, f_post
+                            )
+            return f_result
+
+        @wp.func
+        def neon_get_thread_data(
+            f0_pn: Any,
+            f1_pn: Any,
+            missing_mask_pn: Any,
+            index: Any,
+        ):
+            # Read thread data for populations
+            _f0_thread = _f_vec()
+            _f1_thread = _f_vec()
+            _missing_mask = _missing_mask_vec()
+            for l in range(self.velocity_set.q):
+                # q-sized vector of pre-streaming populations
+                _f0_thread[l] = self.compute_dtype(wp.neon_read(f0_pn, index, l))
+                _f1_thread[l] = self.compute_dtype(wp.neon_read(f1_pn, index, l))
+                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
+
+            return _f0_thread, _f1_thread, _missing_mask
+
+        import typing
+        @neon.Container.factory(name="nse_multires_stepper")
+        def container(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
+        ):
+            cast_to_store_dtype = self.store_dtype
+            def nse_stepper_ll(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn=loader.get_mres_read_handle(f_0_fd)
+                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
+
+                f_1_pn =loader.get_mres_write_handle(f_1_fd)
+
+                @wp.func
+                def nse_stepper_cl(index: typing.Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+                    # Apply streaming
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+
+                    # Apply post-streaming boundary conditions
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    # Apply post-collision boundary conditions
+                    _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+
+                    # Store the result in f_1
+                    for l in range(self.velocity_set.q):
+                        # TODO: Improve this later
+                        if wp.static("GradsApproximationBC" in active_bcs):
+                            if _boundary_id == wp.static(boundary_condition_registry.bc_to_id["GradsApproximationBC"]):
+                                if _missing_mask[l] == wp.uint8(1):
+                                    wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                loader.declare_kernel(nse_stepper_cl)
+            return nse_stepper_ll
+
+        return None, container
+
+    def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        _, container = self._construct_neon()
+        even = container(target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)  # reads f_0, writes f_1
+        odd = container(target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)  # reads f_1, writes f_0; NOTE(review): `timestep` arg is unused
+        return {'even': even,
+                'odd':odd}
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        """Build a stepper container from the given fields and run it once.
+
+        Args:
+            f_0: Population field forwarded to ``self.neon_container``.
+            f_1: Population field forwarded to ``self.neon_container``.
+            bc_mask: Boundary-condition id field.
+            missing_mask: Missing-population mask field.
+            omega: Relaxation parameter forwarded to the container.
+            timestep: Timestep value forwarded to the container.
+
+        Returns:
+            The ``(f_0, f_1)`` pair exactly as it was passed in; the two buffers
+            are not swapped by this method.
+
+        NOTE(review): ``self.neon_container`` is not defined in this class; confirm that
+        the base class or the backend registration provides it. ``_construct_neon`` only
+        returns a factory that additionally expects a ``level`` argument.
+        """
+        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
+        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+        return f_0, f_1

From 6bafbc4e22d3b1e8aecbb9a0a585fea5c4fcfc9e Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 6 Mar 2025 11:48:31 +0100
Subject: [PATCH 012/208] WIP

---
 .../performance/mlups_3d_multires_solver.py   | 182 ++++++++++++++++++
 xlb/helper/nse_multires_solver.py             |  20 +-
 2 files changed, 192 insertions(+), 10 deletions(-)
 create mode 100644 examples/performance/mlups_3d_multires_solver.py

diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
new file mode 100644
index 00000000..6afbaaa2
--- /dev/null
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -0,0 +1,182 @@
+import xlb
+import argparse
+import time
+import warp as wp
+import numpy as np
+
+# add a directory to the PYTHON PATH
+import sys
+# sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
+import neon
+
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
+from xlb.distribute import distribute
+
+def parse_arguments():
+    """Parse the benchmark's command-line arguments and return the argparse namespace."""
+    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
+    # Positional arguments
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
+    parser.add_argument("num_steps", type=int, help="Number of timesteps to simulate")
+    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
+    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+
+    # Optional arguments
+    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
+    parser.add_argument("--velocity_set", type=str, default='D3Q19',
+                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
+
+    return parser.parse_args()
+
+
+def setup_simulation(args):
+    """Validate the CLI choices, initialise XLB and return ``(backend, precision_policy)``."""
+    # Each option is resolved through a lookup table; an unknown key raises ValueError.
+    backends_by_name = {"jax": ComputeBackend.JAX, "warp": ComputeBackend.WARP, "neon": ComputeBackend.NEON}
+    if args.backend not in backends_by_name:
+        raise ValueError("Invalid backend")
+    backend = backends_by_name[args.backend]
+
+    policies_by_name = {
+        "fp32/fp32": PrecisionPolicy.FP32FP32,
+        "fp64/fp64": PrecisionPolicy.FP64FP64,
+        "fp64/fp32": PrecisionPolicy.FP64FP32,
+        "fp32/fp16": PrecisionPolicy.FP32FP16,
+    }
+    if args.precision not in policies_by_name:
+        raise ValueError("Invalid precision")
+    precision_policy = policies_by_name[args.precision]
+
+    # Only the requested lattice class is instantiated.
+    lattices_by_name = {"D3Q19": xlb.velocity_set.D3Q19, "D3Q27": xlb.velocity_set.D3Q27}
+    if args.velocity_set not in lattices_by_name:
+        raise ValueError("Invalid velocity set")
+    velocity_set = lattices_by_name[args.velocity_set](precision_policy=precision_policy, backend=backend)
+
+    xlb.init(
+        velocity_set=velocity_set,
+        default_backend=backend,
+        default_precision_policy=precision_policy,
+    )
+
+    return backend, precision_policy
+
+
+def run(backend, precision_policy, grid_shape, num_steps):
+    """Set up the lid-driven cavity case, advance it ``num_steps`` times and return the elapsed seconds."""
+    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)  # NOTE(review): always D3Q19 here, independent of --velocity_set
+
+    dim = neon.Index_3d(grid_shape[0],
+                        grid_shape[1],
+                        grid_shape[2])
+    level_zero_mask = np.ones((dim.x, dim.y, dim.z), dtype=int)
+    level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
+    #
+    # level_one_mask = np.zeros((2, 2, 2), dtype=int)
+    # level_one_mask[0, 0, 0] = 1
+    # level_one_mask[0, 0, 1] = 0
+    # level_one_mask[0, 1, 0] = 0
+    # level_one_mask[1, 1, 1] = 1
+    #
+    # grid = neon.mGrid(bk, dim,
+    #                   sparsity_pattern_list=[
+    #                       np.ascontiguousarray(maskZero, dtype=np.int32),
+    #                       np.ascontiguousarray(maskOne, dtype=np.int32),
+    #                   ],
+    #                   sparsity_pattern_origins=[neon.Index_3d(0, 0, 0),
+    #                                             neon.Index_3d(0, 0, 0)],
+    #                   stencil=[[0, 0, 0], [1, 0, 0]], )
+
+    grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
+                                 sparsity_pattern_list=[level_zero_mask],
+                                 sparsity_pattern_origins=[neon.Index_3d(0, 0, 0)])
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    lid = box_no_edge["top"]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = np.unique(np.array(walls), axis=-1).tolist()  # drop index columns that appear in more than one face
+
+    prescribed_vel = 0.05  # x-velocity imposed on the lid through the EquilibriumBC below
+
+    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
+
+    # Create stepper
+    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
+
+    Re = 10000.0
+    clength = grid_shape[0] - 1
+    visc = prescribed_vel * clength / Re
+    omega = 1.0 / (3.0 * visc + 0.5)  # relaxation parameter derived from the viscosity above
+
+    # # Initialize fields and run simulation
+    # omega = 1.0
+
+    sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
+    print("start timing")
+    start_time = time.time()
+
+    for i in range(num_steps):
+        print(f"step {i}")  # NOTE(review): per-step printing happens inside the timed region
+        sim.step()
+        if i%500 == 0:
+            sim.export_macroscopic("u_lid_driven_cavity_")  # also inside the timed region
+    wp.synchronize()  # wait for outstanding device work so it is counted in the measurement
+    t = time.time() - start_time
+
+    sim.export_macroscopic("u_lid_driven_cavity_")
+    return t
+
+
+def calculate_mlups(cube_edge, num_steps, elapsed_time):
+    """Return the throughput in millions of lattice updates per second for a cubic grid."""
+    updates_per_second = (cube_edge**3 * num_steps) / elapsed_time
+    return updates_per_second / 1e6
+
+def post_process(macro, rho, u, f_0,  i):
+    """Compute rho and u from f_0 with ``macro`` and export both fields as VTI files.
+
+    Args:
+        macro: callable invoked as ``macro(f_0, rho, u)``; must return ``(rho, u)``.
+        rho, u: fields exposing ``update_host(0)`` and ``export_vti(filename, label)``.
+        f_0, i: populations passed to ``macro``; index embedded in the output file names.
+    """
+    rho, u = macro(f_0, rho, u )
+    wp.synchronize()
+    u.update_host(0)
+    rho.update_host(0)
+    wp.synchronize()
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+
+    pass
+
+    # # remove boundary cells
+    # rho = rho[:, 1:-1, 1:-1, 1:-1]
+    # u = u[:, 1:-1, 1:-1, 1:-1]
+    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
+    #
+    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
+    #
+    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
+    # ny=fields["u_magnitude"].shape[1]
+    # from xlb.utils import  save_image
+    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
+
+def main():
+    """Parse the CLI, run the benchmark on a cubic grid and print the elapsed time and MLUPS."""
+    args = parse_arguments()
+    backend, precision_policy = setup_simulation(args)
+    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)  # same edge length along x, y and z
+    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
+    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
+
+    print(f"Simulation completed in {elapsed_time:.2f} seconds")
+    print(f"MLUPs: {mlups:.2f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index ebbc1b03..296f5fd2 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -64,16 +64,16 @@ def __init_containers(self, num_levels):
         self.skeleton_odd.sequence(name="odd lbm", containers=[self.odd_step])
 
     def export_macroscopic(self, fname_prefix):
-        if self.iteration_idx % 2 == 0:
-            self.even_macroscopic.run(0)
-        else:
-            self.odd_macroscopic.run(0)
-
-        import warp as wp
-        wp.synchronize()
-        self.u.update_host(0)
-        wp.synchronize()
-        self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
+        # if self.iteration_idx % 2 == 0:
+        #     self.even_macroscopic.run(0)
+        # else:
+        #     self.odd_macroscopic.run(0)
+        #
+        # import warp as wp
+        # wp.synchronize()
+        # self.u.update_host(0)
+        # wp.synchronize()
+        # self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
 
         return
 

From e0229de06627eae435ee9305ddcb0edf998a7ad1 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 31 Mar 2025 10:16:26 +0200
Subject: [PATCH 013/208] WIP

---
 .gitignore                                    |   3 +-
 .../performance/mlups_3d_multires_solver.py   |  11 +-
 xlb/helper/nse_multires_solver.py             |  10 +-
 xlb/operator/stepper/nse_multires_stepper.py  | 229 ++++++++++++++++--
 4 files changed, 225 insertions(+), 28 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3bbd875a..aecc7a13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -154,4 +154,5 @@ checkpoints/*
 # Ignore Python packaging build directories
 dist/
 build/
-*.egg-info/
\ No newline at end of file
+*.egg-info/
+*.dot
diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index 6afbaaa2..aaa17f37 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -73,8 +73,12 @@ def run(backend, precision_policy, grid_shape, num_steps):
     dim = neon.Index_3d(grid_shape[0],
                         grid_shape[1],
                         grid_shape[2])
-    level_zero_mask = np.ones((dim.x, dim.y, dim.z), dtype=int)
+    level_zero_mask = np.ones((dim.x//2, dim.y, dim.z), dtype=int)
     level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
+
+    level_one_mask = np.ones((dim.x//2, dim.y, dim.z), dtype=int)
+    level_one_mask = np.ascontiguousarray(level_one_mask, dtype=np.int32)
+
     #
     # level_one_mask = np.zeros((2, 2, 2), dtype=int)
     # level_one_mask[0, 0, 0] = 1
@@ -92,8 +96,9 @@ def run(backend, precision_policy, grid_shape, num_steps):
     #                   stencil=[[0, 0, 0], [1, 0, 0]], )
 
     grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
-                                 sparsity_pattern_list=[level_zero_mask],
-                                 sparsity_pattern_origins=[neon.Index_3d(0, 0, 0)])
+                                 sparsity_pattern_list=[level_one_mask, level_zero_mask, ],
+                                 sparsity_pattern_origins=[ neon.Index_3d(dim.x//2+1, 0, 0), neon.Index_3d(0, 0, 0),])
+
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
     lid = box_no_edge["top"]
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index 296f5fd2..1ebac2a8 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -5,7 +5,7 @@
 from xlb.precision_policy import Precision
 from typing import Tuple, List
 import neon
-
+import warp as wp
 
 class Nse_multires_simulation:
     def __init__(self, grid, velocity_set, stepper, omega):
@@ -25,6 +25,14 @@ def __init__(self, grid, velocity_set, stepper, omega):
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
 
+        fname_prefix='test'
+        self.rho.fill_run(0, 0.0, 0)
+        self.rho.fill_run(0, 1.0, 0)
+        wp.synchronize()
+        self.rho.update_host(0)
+        wp.synchronize()
+        self.rho.export_vti(f"{fname_prefix}_topology.vti", 'u')
+
         self.odd_step = None
         self.even_step = None
         self.iteration_idx = -1
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index aa19f82c..d0516d54 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -145,7 +145,7 @@ def _construct_neon(self):
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
         id_to_bc = boundary_condition_registry.id_to_bc
-
+        _zero = self.compute_dtype(0)
         # Gather IDs of ExtrapolationOutflowBC boundary conditions
         extrapolation_outflow_bc_ids = []
         for bc_name, bc_id in bc_to_id.items():
@@ -205,8 +205,8 @@ def neon_get_thread_data(
             return _f0_thread, _f1_thread, _missing_mask
 
         import typing
-        @neon.Container.factory(name="nse_multires_stepper")
-        def container(
+        @neon.Container.factory(name="finest_collide")
+        def single_step_finest(
                 level: int,
                 f_0_fd: Any,
                 f_1_fd: Any,
@@ -215,8 +215,14 @@ def container(
                 omega: Any,
                 timestep: int,
         ):
-            cast_to_store_dtype = self.store_dtype
-            def nse_stepper_ll(loader: neon.Loader):
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_single_step_finest(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
                 f_0_pn=loader.get_mres_read_handle(f_0_fd)
@@ -226,18 +232,15 @@ def nse_stepper_ll(loader: neon.Loader):
                 f_1_pn =loader.get_mres_write_handle(f_1_fd)
 
                 @wp.func
-                def nse_stepper_cl(index: typing.Any):
+                def cl_single_step_finest(index: typing.Any):
+                    _c = self.velocity_set.c
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
-                    # Apply streaming
-                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
+                    # Read thread data for populations, these are post streaming
                     _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
-                    _f_post_collision = _f0_thread
-
-                    # Apply post-streaming boundary conditions
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+                    _f_post_stream = _f0_thread
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
@@ -246,18 +249,198 @@ def nse_stepper_cl(index: typing.Any):
                     # Apply post-collision boundary conditions
                     _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
 
-                    # Store the result in f_1
+                    # Apply streaming boundary conditions
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, True)
+                    _opposite_c_idx = self.velocity_set.self.opp_indices
+
+
                     for l in range(self.velocity_set.q):
-                        # TODO: Improve this later
-                        if wp.static("GradsApproximationBC" in active_bcs):
-                            if _boundary_id == wp.static(boundary_condition_registry.bc_to_id["GradsApproximationBC"]):
-                                if _missing_mask[l] == wp.uint8(1):
-                                    wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
-                loader.declare_kernel(nse_stepper_cl)
-            return nse_stepper_ll
-
-        return None, container
+                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
+                                                   wp.int8(_c[1, l]),
+                                                   wp.int8(_c[2, l]))
+                        ## Store
+                        if od_or_even == 0:
+                            wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_stream[l])
+                        else:
+                            wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction,_f_post_stream[l])
+
+                        ## Push stream
+                        is_active = wp.neon_is_active(f_0_pn, index, push_direction)
+                        if is_active:
+                            ngh_gidx = wp.neon_ngh_idx(f_0_pn, index, push_direction)
+                            ngh_boundary_id = wp.neon_read(bc_mask_pn, ngh_gidx, 0)
+                            ## WHAT IS BULK?
+                            if ngh_boundary_id == BULK:
+                                wp.neon_write(f_1_pn, ngh_gidx, l, _f_post_stream[l])
+                            else:
+                                opposite_l = _opp_indices[l]
+                                wp.neon_write(f_1_pn, index, opposite_l, _f_post_stream[l])
+                        else:
+                            if wp.int8(_c[0, l]) != 0 and wp.int8(_c[1, l]) != 0 and wp.int8(_c[2, l]) != 0:
+                                opposite_l = _opp_indices[l]
+                                is_valid = False
+                                value = self.compute_dtype(0)
+                                if od_or_even == 0:
+                                    value = wp.neon_uncle_read(f_1_pn, index, push_direction, opposite_l, value, is_valid)
+                                else:
+                                    value = wp.neon_uncle_read(f_0_pn, index, push_direction, opposite_l, value, is_valid)
+                                if is_valid:
+                                    wp.neon_write(f_1_pn, index, l, _f_post_stream[l], value)
+
+
+                loader.declare_kernel(cl_single_step_finest)
+            return ll_single_step_finest
+
+
+        @neon.Container.factory(name="collide_coarse")
+        def collide_coarse(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().get_num_levels()
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_collide_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn=loader.get_mres_read_handle(f_0_fd)
+                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
+
+                f_1_pn =loader.get_mres_write_handle(f_1_fd)
+
+                @wp.func
+                def cl_collide_coarse(index: typing.Any):
+                    _c = self.velocity_set.c
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    """
+                    The c++ version starts with the following, which I am not sure is right:
+                        if (type(cell, 0) == CellType::bulk ) {
+                    CB type cells should do collide too  
+                    """
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    if not wp.neon_has_children(f_0_pn, index):
+
+                        # Read thread data for populations, these are post streaming
+                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                        _f_post_stream = _f0_thread
+
+                        _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                        _feq = self.equilibrium.neon_functional(_rho, _u)
+                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                        # Apply post-collision boundary conditions
+                        _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+
+                        for l in range(self.velocity_set.q):
+                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                            if(level < num_levels - 1):
+                                ## Store
+                                if od_or_even == 0:
+                                    wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
+                                else:
+                                    wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_collision[l])
+
+                            wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                    else:
+                        for l in range(self.velocity_set.q):
+                            wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
+
+
+
+                loader.declare_kernel(cl_collide_coarse)
+            return ll_collide_coarse
+
+        @neon.Container.factory(name="stream_coarse")
+        def stream_coarse(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().get_num_levels()
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                @wp.func
+                def cl_stream_coarse(index: typing.Any):
+                    _c = self.velocity_set.c
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(255):
+                        if not wp.neon_has_children(f_0_pn, index):
+                            # do stream normally
+                            _f_post_stream = self.stream.warp_functional(f_0, index)
+
+                            # do mres corrections
+                            for l in range(self.velocity_set.q):
+                                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+                                if wp.neon_hasChildren(bc_mask_pn, index, pull_direction):
+                                    is_valid = wp.bool(False)
+                                    read_accumulate_date = wp.neon_ngh_data(bc_mask_pn, index, pull_direction, l, is_valid)
+                                    if is_valid:
+                                        _f_post_stream[l] = read_accumulate_date * this.compute_dtype(0.5)
+
+                            # do non mres post-streaming corrections
+                            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_collision, _f_post_stream, True)
+
+
+
+                            if level < num_levels - 1:
+                                ## Store
+                                if od_or_even == 0:
+                                    wp.neon_mres_lbm_store_op(
+                                        f_0_pn,
+                                        index,
+                                        l,
+                                        push_direction,
+                                        _f_post_collision[l],
+                                    )
+                                else:
+                                    wp.neon_mres_lbm_store_op(
+                                        f_1_pn,
+                                        index,
+                                        l,
+                                        push_direction,
+                                        _f_post_collision[l],
+                                    )
+
+                                wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(cl_collide_coarse)
+
+            return ll_collide_coarse
+
+        return None, {"single_step_finest": single_step_finest, "collide_coarse": collide_coarse, "stream_coarse": stream_coarse}
+
+
 
     def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         _, container = self._construct_neon()

From dd77618d6dd2cb3e937ec999285dbfe2d7be6ee0 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 7 Apr 2025 19:17:41 +0200
Subject: [PATCH 014/208] WIP

---
 .../performance/mlups_3d_multires_solver.py   | 76 +++++++++++-----
 xlb/helper/nse_multires_solver.py             | 87 ++++++++++++-------
 .../mulltires_quadratic_equilibrium.py        |  4 +
 xlb/operator/macroscopic/macroscopic.py       | 62 +++++++++++++
 .../macroscopic/multires_macroscopic.py       | 41 +++++++--
 xlb/operator/stepper/nse_multires_stepper.py  | 82 ++++++++---------
 6 files changed, 247 insertions(+), 105 deletions(-)

diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index aaa17f37..421ebeb5 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -73,31 +73,62 @@ def run(backend, precision_policy, grid_shape, num_steps):
     dim = neon.Index_3d(grid_shape[0],
                         grid_shape[1],
                         grid_shape[2])
-    level_zero_mask = np.ones((dim.x//2, dim.y, dim.z), dtype=int)
+    level_zero_mask = np.zeros((dim.x, dim.y, dim.z), dtype=int)
     level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
+    # loop over all the elements in level_zero_mask and set to one any that have x=0 or y=0 or z=0
+    for i in range(dim.x):
+        for j in range(dim.y):
+            for k in range(dim.z):
+                if i == 0 or j == 0 or k == 0:
+                    level_zero_mask[i, j, k] = 1
+                if i == dim.x-1 or j == dim.y-1 or k == dim.z-1:
+                    level_zero_mask[i, j, k] = 1
+                if i == 1 or j == 1 or k == 1:
+                    level_zero_mask[i, j, k] = 1
+                if i == dim.x-2 or j == dim.y-2 or k == dim.z-2:
+                    level_zero_mask[i, j, k] = 1
+                if (i == 2 or j == 2 or k == 2):
+                    level_zero_mask[i, j, k] = 1
+                if i == dim.x-3 or j == dim.y-3 or k == dim.z-3:
+                    level_zero_mask[i, j, k] = 1
+                if i == 3 or j == 3 or k == 3:
+                    level_zero_mask[i, j, k] = 1
+                if i == dim.x-4 or j == dim.y-4 or k == dim.z-4:
+                    level_zero_mask[i, j, k] = 1
+
+
+
+    level_one_mask = np.ones((dim.x//2, dim.y//2, dim.z//2), dtype=int)
+    m = neon.Index_3d(dim.x // 2, dim.y // 2, dim.z // 2)
+    # level_one_mask[0, 0, 0] = 1
+    # # level_one_mask[1, 0, 0] = 1
+    # # level_one_mask[2, 0, 0] = 1
+    # # level_one_mask[2, 0, 0] = 1
+    # # level_one_mask[m.x-3, 0, 0] = 1
+    # # level_one_mask[m.x-2, 0, 0] = 1
+    # # level_one_mask[m.x-1, 0, 0] = 1
+
+    for i in range(dim.x//2):
+        for j in range(dim.y//2):
+            for k in range(dim.z//2):
+                m = neon.Index_3d(dim.x//2,
+                                  dim.y//2,
+                                  dim.z//2)
+                if i == 0 or j == 0 or k == 0:
+                    level_one_mask[i, j, k] = 0
+                if i == m.x-1 or j == m.y-1 or k == m.z-1:
+                    level_one_mask[i, j, k] = 0
+                if i == 1 or j == 1 or k == 1:
+                    level_one_mask[i, j, k] = 0
+                if (i == m.x-2 or j == m.y-2 or k == m.z-2):
+                    level_one_mask[i, j, k] = 0
 
-    level_one_mask = np.ones((dim.x//2, dim.y, dim.z), dtype=int)
     level_one_mask = np.ascontiguousarray(level_one_mask, dtype=np.int32)
 
-    #
-    # level_one_mask = np.zeros((2, 2, 2), dtype=int)
-    # level_one_mask[0, 0, 0] = 1
-    # level_one_mask[0, 0, 1] = 0
-    # level_one_mask[0, 1, 0] = 0
-    # level_one_mask[1, 1, 1] = 1
-    #
-    # grid = neon.mGrid(bk, dim,
-    #                   sparsity_pattern_list=[
-    #                       np.ascontiguousarray(maskZero, dtype=np.int32),
-    #                       np.ascontiguousarray(maskOne, dtype=np.int32),
-    #                   ],
-    #                   sparsity_pattern_origins=[neon.Index_3d(0, 0, 0),
-    #                                             neon.Index_3d(0, 0, 0)],
-    #                   stencil=[[0, 0, 0], [1, 0, 0]], )
-
     grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
-                                 sparsity_pattern_list=[level_one_mask, level_zero_mask, ],
-                                 sparsity_pattern_origins=[ neon.Index_3d(dim.x//2+1, 0, 0), neon.Index_3d(0, 0, 0),])
+                                 sparsity_pattern_list=[ level_zero_mask,level_one_mask ,],
+                                 sparsity_pattern_origins=[ neon.Index_3d(0, 0, 0),
+                                                            neon.Index_3d(0, 0, 0),])
 
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -121,13 +152,16 @@ def run(backend, precision_policy, grid_shape, num_steps):
     # omega = 1.0
 
     sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
+
+    sim.export_macroscopic("Initial_")
+
     print("start timing")
     start_time = time.time()
 
     for i in range(num_steps):
         print(f"step {i}")
         sim.step()
-        if i%500 == 0:
+        if i%1 == 0:
             sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index 1ebac2a8..0c52ab2a 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -48,46 +48,71 @@ def __init__(self, grid, velocity_set, stepper, omega):
 
     def __init_containers(self, num_levels):
         # working only with level 0 for now
-        target_level = 0
-        containers = self.stepper.get_containers(target_level,
-                                                 self.f_0,
-                                                 self.f_1,
-                                                 self.bc_mask,
-                                                 self.missing_mask,
-                                                 self.omega,
-                                                 self.iteration_idx)
-
-        self.even_step = containers['even']
-        self.odd_step = containers['odd']
+        self.containers = {}
+        for target_level in range(num_levels):
+            self.containers[f"{target_level}"] = self.stepper.get_containers(target_level,
+                                                     self.f_0,
+                                                     self.f_1,
+                                                     self.bc_mask,
+                                                     self.missing_mask,
+                                                     self.omega,
+                                                     self.iteration_idx)
+            pass
+
+        # self.even_step = containers['even']
+        # self.odd_step = containers['odd']
+        #
+        self.macroscopics = {}
 
-        containers = self.macro.get_containers(target_level, self.f_0, self.f_1,self.rho, self.u)
+        for target_level in range(num_levels):
+            self.macroscopics[f"{target_level}"] = self.macro.get_containers(target_level, self.f_0, self.f_1, self.bc_mask, self.rho, self.u)
 
-        self.even_macroscopic = containers['even']
-        self.odd_macroscopic = containers['odd']
+        #
+        # # self.skeleton_even = neon.Skeleton(self.grid.get_neon_backend())
+        # # self.skeleton_odd = neon.Skeleton(self.grid.get_neon_backend())
+        # #
+        # # self.skeleton_even.sequence(name="even lbm", containers=[self.even_step])
+        # # self.skeleton_odd.sequence(name="odd lbm", containers=[self.odd_step])
 
-        self.skeleton_even = neon.Skeleton(self.grid.get_neon_backend())
-        self.skeleton_odd = neon.Skeleton(self.grid.get_neon_backend())
+    def export_macroscopic(self, fname_prefix):
+        print("exporting macroscopic")
+        for target_level in range(self.grid.count_levels):
+            if self.iteration_idx % 2 == 0:
+                self.macroscopics[f"{target_level}"]['even'][0].run(0)
+            else:
+                self.macroscopics[f"{target_level}"]['odd'][0].run(0)
 
-        self.skeleton_even.sequence(name="even lbm", containers=[self.even_step])
-        self.skeleton_odd.sequence(name="odd lbm", containers=[self.odd_step])
 
-    def export_macroscopic(self, fname_prefix):
-        # if self.iteration_idx % 2 == 0:
-        #     self.even_macroscopic.run(0)
-        # else:
-        #     self.odd_macroscopic.run(0)
-        #
-        # import warp as wp
-        # wp.synchronize()
-        # self.u.update_host(0)
-        # wp.synchronize()
-        # self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
+        import warp as wp
+        wp.synchronize()
+        self.u.update_host(0)
+        wp.synchronize()
+        self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
+        print("DONE exporting macroscopic")
 
         return
 
+    # one step at the corase level
     def step(self):
         self.iteration_idx += 1
+
         if self.iteration_idx % 2 == 0:
-            self.even_step.run(0)
+            self.containers["1"]["even"]['collide_coarse'].run(0)
+            wp.synchronize()
+            self.containers["0"]["even"]['collide_coarse'].run(0)
+            wp.synchronize()
+            self.containers["0"]["even"]['stream_coarse'].run(0)
+            wp.synchronize()
+            self.containers["0"]["odd"]['collide_coarse'].run(0)
+            wp.synchronize()
+            self.containers["0"]["odd"]['stream_coarse'].run(0)
+            wp.synchronize()
+            self.containers["1"]["even"]['stream_coarse'].run(0)
+            wp.synchronize()
         else:
-            self.odd_step.run(0)
+            self.containers["1"]["odd"]["collide_coarse"].run(0)
+            self.containers["0"]["even"]["collide_coarse"].run(0)
+            self.containers["0"]["even"]["stream_coarse"].run(0)
+            self.containers["0"]["odd"]["collide_coarse"].run(0)
+            self.containers["0"]["odd"]["stream_coarse"].run(0)
+            self.containers["1"]["odd"]["stream_coarse"].run(0)
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index fce7ae1a..20974f5e 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -85,6 +85,10 @@ def quadratic_equilibrium_cl(index: typing.Any):
                     for l in range(self.velocity_set.q):
                         #wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
                         wp.neon_write(f_pn, index, l, feq[l])
+                    if wp.neon_has_children(f_pn, index):
+                        for l in range(self.velocity_set.q):
+                            zero_val = self.compute_dtype(0.0)
+                            wp.neon_write(f_pn, index, l, zero_val)
                 loader.declare_kernel(quadratic_equilibrium_cl)
             return quadratic_equilibrium_ll
         return functional, container
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 7b046fbf..b1927b24 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -108,11 +108,73 @@ def macroscopic_cl(gIdx: typing.Any):
             return macroscopic_ll
         return functional, container
 
+
+    def _construct_neon_visual(self):
+        zero_moment_func = self.zero_moment.neon_functional
+        first_moment_func = self.first_moment.neon_functional
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(f: _f_vec):
+            rho = zero_moment_func(f)
+            u = first_moment_func(f, rho)
+            return rho, u
+
+
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        import neon, typing
+        @neon.Container.factory("macroscopic")
+        def container(
+                f_field: Any,
+                bc_mask: Any,
+                rho_field: Any,
+                u_fild: Any,
+        ):
+            _d = self.velocity_set.d
+            def macroscopic_ll(loader: neon.Loader):
+                loader.set_grid(f_field.get_grid())
+
+                rho=loader.get_read_handle(rho_field)
+                u =loader.get_read_handle(u_fild)
+                f=loader.get_read_handle(f_field)
+                bc_mask_pn = loader.get_read_handle(bc_mask)
+                @wp.func
+                def macroscopic_cl(gIdx: typing.Any):
+                    _f = _f_vec()
+                    _boundary_id = wp.neon_read(bc_mask_pn, gIdx, 0)
+
+                    for l in range(self.velocity_set.q):
+                        _f[l] = wp.neon_read(f, gIdx,l)
+                    _rho, _u = functional(_f)
+                    if _boundary_id != wp.uint8(0):
+                        _rho = self.compute_dtype(1.0)
+                        for d in range(_d):
+                            _u[d] = self.compute_dtype(0.0)
+                    if _boundary_id == wp.uint8(255):
+                        _rho = self.compute_dtype(0.0)
+                        for d in range(_d):
+                            _u[d] = self.compute_dtype(0.0)
+
+                    wp.neon_write(rho, gIdx, 0, _rho)
+                    for d in range(_d):
+                        wp.neon_write(u, gIdx, d, _u[d])
+
+                loader.declare_kernel(macroscopic_cl)
+            return macroscopic_ll
+        return functional, container
+
+
     def get_containers(self, f_0, f_1, rho, u):
         _, container = self._construct_neon()
         return {'even': container(f_0,   rho, u),
                 'odd': container(f_1,  rho, u)}
 
+    def get_containers_visual(self, f_0, f_1, bc_mask, rho, u):
+        _, container = self._construct_neon()
+        return {'even': container(f_0,  bc_mask, rho, u),
+                'odd': container(f_1, bc_mask,  rho, u)}
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho, u):
         c = self.neon_container(f, rho, u)
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index d79428cc..cc657903 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -76,10 +76,11 @@ def functional(f: _f_vec):
         import neon, typing
         @neon.Container.factory("macroscopic")
         def container(
-                level: int,
-                f_field: Any,
-                rho_field: Any,
-                u_fild: Any,
+            level: int,
+            f_field: Any,
+            bc_mask: Any,
+            rho_field: Any,
+            u_fild: Any,
         ):
             _d = self.velocity_set.d
             def macroscopic_ll(loader: neon.Loader):
@@ -88,27 +89,53 @@ def macroscopic_ll(loader: neon.Loader):
                 rho=loader.get_mres_read_handle(rho_field)
                 u =loader.get_mres_read_handle(u_fild)
                 f=loader.get_mres_write_handle(f_field)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
 
                 @wp.func
                 def macroscopic_cl(gIdx: typing.Any):
                     _f = _f_vec()
+                    _boundary_id = wp.neon_read(bc_mask_pn, gIdx, 0)
+
                     for l in range(self.velocity_set.q):
                         _f[l] = wp.neon_read(f, gIdx,l)
                     _rho, _u = functional(_f)
+                    if _boundary_id != wp.uint8(0):
+                        _rho = self.compute_dtype(1.0)
+                        for d in range(_d):
+                            _u[d] = self.compute_dtype(0.0)
+                    if _boundary_id == wp.uint8(255):
+                        _rho = self.compute_dtype(0.0)
+                        for d in range(_d):
+                            _u[d] = self.compute_dtype(0.0)
+
                     wp.neon_write(rho, gIdx, 0, _rho)
                     for d in range(_d):
                         wp.neon_write(u, gIdx, d, _u[d])
 
+                    if wp.neon_has_children(f, gIdx):
+                        offVal = self.compute_dtype(-33000.0)
+                        zero_val = self.compute_dtype(0.0)
+                        wp.neon_write(rho, gIdx, 0, zero_val)
+                        wp.neon_write(u, gIdx, 0, offVal)
+                        wp.neon_write(u, gIdx, 1, zero_val)
+                        wp.neon_write(u, gIdx, 2, zero_val)
+                    else:
+                        offVal = self.compute_dtype(+33000.0)
+                        zero_val = self.compute_dtype(0.0)
+                        wp.neon_write(rho, gIdx, 0, zero_val)
+                        wp.neon_write(u, gIdx, 0, offVal)
+                        wp.neon_write(u, gIdx, 1, zero_val)
+                        wp.neon_write(u, gIdx, 2, zero_val)
                 loader.declare_kernel(macroscopic_cl)
             return macroscopic_ll
         return functional, container
 
-    def get_containers(self, target_level, f_0, f_1, rho, u):
+    def get_containers(self, target_level, f_0, f_1, bc_mask, rho, u):
         _, container = self._construct_neon()
         evenList = []
         oddList = []
-        evenList.append(container(target_level, f_0,   rho, u))
-        oddList.append( container(target_level, f_1,  rho, u))
+        evenList.append(container(target_level, f_0, bc_mask,   rho, u))
+        oddList.append( container(target_level, f_1, bc_mask,  rho, u))
         return {'even':evenList ,
                 'odd':oddList }
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index d0516d54..bf412575 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -76,10 +76,11 @@ def prepare_fields(self, initializer=None):
 
         # Initialize distribution functions if initializer is provided
         if initializer is not None:
-            f_0 = initializer(self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
+            # throw an exception because this option is not implemented yet
+            raise Exception("Initializer is not implemented yet")
+            #f_0 = initializer(self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
         else:
             from xlb.helper.initializers import initialize_multires_eq
-
             f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
 
         if self.compute_backend == ComputeBackend.NEON:
@@ -93,7 +94,7 @@ def prepare_fields(self, initializer=None):
         bc_mask.update_host(0)
         missing_mask.update_host(0)
         wp.synchronize()
-        #bc_mask.export_vti("bc_mask.vti", 'bc_mask')
+        bc_mask.export_vti("bc_mask.vti", 'bc_mask')
         #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         return f_0, f_1, bc_mask, missing_mask
@@ -303,12 +304,9 @@ def collide_coarse(
                 timestep: int,
         ):
             num_levels = f_0_fd.get_grid().get_num_levels()
-            if level != 0:
-                # throw an exception
-                raise Exception("Only the finest level is supported for now")
 
             # module op to define odd of even iteration
-            od_or_even = wp.module("odd_or_even", "even")
+            od_or_even = wp.mod(timestep, 2)
 
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -316,12 +314,12 @@ def ll_collide_coarse(loader: neon.Loader):
                 f_0_pn=loader.get_mres_read_handle(f_0_fd)
                 bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
-
                 f_1_pn =loader.get_mres_write_handle(f_1_fd)
 
+                _c = self.velocity_set.c
+
                 @wp.func
                 def cl_collide_coarse(index: typing.Any):
-                    _c = self.velocity_set.c
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     """
                     The c++ version starts with the following, which I am not sure is right:
@@ -358,6 +356,8 @@ def cl_collide_coarse(index: typing.Any):
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
 
+                    wp.print("collide_coarse")
+
 
 
                 loader.declare_kernel(cl_collide_coarse)
@@ -379,75 +379,65 @@ def stream_coarse(
             #     raise Exception("Only the finest level is supported for now")
 
             # module op to define odd of even iteration
-            od_or_even = wp.module("odd_or_even", "even")
+            #od_or_even = wp.module("odd_or_even", "even")
 
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
-                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+                _c = self.velocity_set.c
 
                 @wp.func
                 def cl_stream_coarse(index: typing.Any):
-                    _c = self.velocity_set.c
+                    _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(255):
                         if not wp.neon_has_children(f_0_pn, index):
                             # do stream normally
-                            _f_post_stream = self.stream.warp_functional(f_0, index)
+                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                            _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
                             # do mres corrections
                             for l in range(self.velocity_set.q):
                                 pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-                                if wp.neon_hasChildren(bc_mask_pn, index, pull_direction):
+                                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
+                                if wp.neon_has_children(f_0_pn, index, pull_direction):
                                     is_valid = wp.bool(False)
-                                    read_accumulate_date = wp.neon_ngh_data(bc_mask_pn, index, pull_direction, l, is_valid)
+                                    read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
                                     if is_valid:
-                                        _f_post_stream[l] = read_accumulate_date * this.compute_dtype(0.5)
+                                        wp.print("read_accumulate_date")
+                                        _f_post_stream[l] = self.compute_dtype(33) #read_accumulate_date * self.compute_dtype(0.5)
 
                             # do non mres post-streaming corrections
-                            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_collision, _f_post_stream, True)
-
-
-
-                            if level < num_levels - 1:
-                                ## Store
-                                if od_or_even == 0:
-                                    wp.neon_mres_lbm_store_op(
-                                        f_0_pn,
-                                        index,
-                                        l,
-                                        push_direction,
-                                        _f_post_collision[l],
-                                    )
-                                else:
-                                    wp.neon_mres_lbm_store_op(
-                                        f_1_pn,
-                                        index,
-                                        l,
-                                        push_direction,
-                                        _f_post_collision[l],
-                                    )
+                            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_stream, True)
 
-                                wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                            for l in range(self.velocity_set.q):
+                                wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                    wp.print("stream_coarse")
 
-                loader.declare_kernel(cl_collide_coarse)
+                loader.declare_kernel(cl_stream_coarse)
 
-            return ll_collide_coarse
+            return ll_stream_coarse
 
-        return None, {"single_step_finest": single_step_finest, "collide_coarse": collide_coarse, "stream_coarse": stream_coarse}
+        return None, {
+            #"single_step_finest": single_step_finest,
+            "collide_coarse": collide_coarse,
+            "stream_coarse": stream_coarse}
 
 
 
     def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        containers = {'even': {}, 'odd': {}}
         _, container = self._construct_neon()
-        even = container(target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
-        odd = container(target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)
-        return {'even': even,
-                'odd':odd}
+        for key in container.keys():
+            containers['odd'][key] = container[key](target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)
+            containers['even'][key] = container[key](target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
+        return containers
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):

From ca2586c6e1e86f0081bd41e98ab39ab9391c22f7 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 10 Apr 2025 12:10:25 +0200
Subject: [PATCH 015/208] WIP

---
 xlb/helper/nse_multires_solver.py            | 15 +++------------
 xlb/operator/stepper/nse_multires_stepper.py | 16 +++++++++-------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index 0c52ab2a..c5fd233e 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -49,6 +49,8 @@ def __init__(self, grid, velocity_set, stepper, omega):
     def __init_containers(self, num_levels):
         # working only with level 0 for now
         self.containers = {}
+        self.macroscopics = {}
+
         for target_level in range(num_levels):
             self.containers[f"{target_level}"] = self.stepper.get_containers(target_level,
                                                      self.f_0,
@@ -59,23 +61,12 @@ def __init_containers(self, num_levels):
                                                      self.iteration_idx)
             pass
 
-        # self.even_step = containers['even']
-        # self.odd_step = containers['odd']
-        #
-        self.macroscopics = {}
-
         for target_level in range(num_levels):
             self.macroscopics[f"{target_level}"] = self.macro.get_containers(target_level, self.f_0, self.f_1, self.bc_mask, self.rho, self.u)
 
-        #
-        # # self.skeleton_even = neon.Skeleton(self.grid.get_neon_backend())
-        # # self.skeleton_odd = neon.Skeleton(self.grid.get_neon_backend())
-        # #
-        # # self.skeleton_even.sequence(name="even lbm", containers=[self.even_step])
-        # # self.skeleton_odd.sequence(name="odd lbm", containers=[self.odd_step])
 
     def export_macroscopic(self, fname_prefix):
-        print("exporting macroscopic")
+        print(f"exporting macroscopic: #levels {self.grid.count_levels}")
         for target_level in range(self.grid.count_levels):
             if self.iteration_idx % 2 == 0:
                 self.macroscopics[f"{target_level}"]['even'][0].run(0)
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index bf412575..40a834cb 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -118,13 +118,15 @@ def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask
             bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
-            mesh_masker = MeshBoundaryMasker(
-                velocity_set=DefaultConfig.velocity_set,
-                precision_policy=DefaultConfig.default_precision_policy,
-                compute_backend=DefaultConfig.default_backend,
-            )
-            for bc in bc_with_vertices:
-                bc_mask, missing_mask = mesh_masker(bc, bc_mask, missing_mask)
+            # throw an exception because this option is not implemented yet
+            raise Exception("Mesh-based boundary conditions are not implemented yet")
+            # mesh_masker = MeshBoundaryMasker(
+            #     velocity_set=DefaultConfig.velocity_set,
+            #     precision_policy=DefaultConfig.default_precision_policy,
+            #     compute_backend=DefaultConfig.default_backend,
+            # )
+            # for bc in bc_with_vertices:
+            #     bc_mask, missing_mask = mesh_masker(bc, bc_mask, missing_mask)
 
         return bc_mask, missing_mask
 

From 42df87bda22e5a284575d61d7e0da326809b6156 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 10 Apr 2025 17:07:54 +0200
Subject: [PATCH 016/208] WIP

---
 .../performance/mlups_3d_multires_solver.py   | 74 ++++++++-----------
 xlb/helper/initializers.py                    | 22 ++----
 xlb/helper/nse_multires_solver.py             | 22 +++---
 .../mulltires_quadratic_equilibrium.py        | 14 ++--
 .../macroscopic/multires_macroscopic.py       | 28 +++----
 xlb/operator/stepper/nse_multires_stepper.py  | 44 +++++------
 6 files changed, 86 insertions(+), 118 deletions(-)

diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index 421ebeb5..b5375568 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -70,6 +70,19 @@ def run(backend, precision_policy, grid_shape, num_steps):
     # Create grid and setup boundary conditions
     velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
 
+    def peel(dim, idx, peel_level, outwards):
+        if outwards:
+            xIn =  idx.x <= peel_level or idx.x >= dim.x -1 -peel_level
+            yIn =  idx.y <= peel_level or idx.y >= dim.y -1 -peel_level
+            zIn =  idx.z <= peel_level or idx.z >= dim.z -1 - peel_level
+            return xIn or yIn or zIn
+        else:
+            xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
+            yIn = idx.y >= peel_level and idx.y <= dim.y - 1 - peel_level
+            zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
+            return xIn and yIn and zIn
+
+
     dim = neon.Index_3d(grid_shape[0],
                         grid_shape[1],
                         grid_shape[2])
@@ -79,49 +92,23 @@ def run(backend, precision_policy, grid_shape, num_steps):
     for i in range(dim.x):
         for j in range(dim.y):
             for k in range(dim.z):
-                if i == 0 or j == 0 or k == 0:
-                    level_zero_mask[i, j, k] = 1
-                if i == dim.x-1 or j == dim.y-1 or k == dim.z-1:
-                    level_zero_mask[i, j, k] = 1
-                if i == 1 or j == 1 or k == 1:
-                    level_zero_mask[i, j, k] = 1
-                if i == dim.x-2 or j == dim.y-2 or k == dim.z-2:
-                    level_zero_mask[i, j, k] = 1
-                if (i == 2 or j == 2 or k == 2):
-                    level_zero_mask[i, j, k] = 1
-                if i == dim.x-3 or j == dim.y-3 or k == dim.z-3:
-                    level_zero_mask[i, j, k] = 1
-                if i == 3 or j == 3 or k == 3:
-                    level_zero_mask[i, j, k] = 1
-                if i == dim.x-4 or j == dim.y-4 or k == dim.z-4:
-                    level_zero_mask[i, j, k] = 1
-
-
-
-    level_one_mask = np.ones((dim.x//2, dim.y//2, dim.z//2), dtype=int)
+                idx = neon.Index_3d(i,j,k)
+                val = 0
+                if peel(dim, idx, 20, True):
+                    val = 1
+                level_zero_mask[i, j, k] = val
+
+
     m = neon.Index_3d(dim.x // 2, dim.y // 2, dim.z // 2)
-    # level_one_mask[0, 0, 0] = 1
-    # # level_one_mask[1, 0, 0] = 1
-    # # level_one_mask[2, 0, 0] = 1
-    # # level_one_mask[2, 0, 0] = 1
-    # # level_one_mask[m.x-3, 0, 0] = 1
-    # # level_one_mask[m.x-2, 0, 0] = 1
-    # # level_one_mask[m.x-1, 0, 0] = 1
-
-    for i in range(dim.x//2):
-        for j in range(dim.y//2):
-            for k in range(dim.z//2):
-                m = neon.Index_3d(dim.x//2,
-                                  dim.y//2,
-                                  dim.z//2)
-                if i == 0 or j == 0 or k == 0:
-                    level_one_mask[i, j, k] = 0
-                if i == m.x-1 or j == m.y-1 or k == m.z-1:
-                    level_one_mask[i, j, k] = 0
-                if i == 1 or j == 1 or k == 1:
-                    level_one_mask[i, j, k] = 0
-                if (i == m.x-2 or j == m.y-2 or k == m.z-2):
-                    level_one_mask[i, j, k] = 0
+    level_one_mask = np.ones((m.x, m.y, m.z), dtype=int)
+    for i in range(m.x):
+        for j in range(m.x):
+            for k in range(m.x):
+                idx = neon.Index_3d(i,j,k)
+                val = 0
+                if peel(dim, idx, dim.x, True) and peel(dim, idx, 3, False):
+                    val = 1
+                level_one_mask[i, j, k] = val
 
     level_one_mask = np.ascontiguousarray(level_one_mask, dtype=np.int32)
 
@@ -138,7 +125,8 @@ def run(backend, precision_policy, grid_shape, num_steps):
 
     prescribed_vel = 0.05
 
-    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
+    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
+                           FullwayBounceBackBC(indices=walls)]
 
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 0b07f480..d41d4b0a 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -26,19 +26,13 @@ def initialize_eq(f, grid, velocity_set, precision_policy, backend, rho=None, u=
     return f
 
 
-def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho=None, u=None):
-    if rho is None:
-        rho = grid.create_field(cardinality=1, fill_value=1.0, dtype=precision_policy.compute_precision)
-    if u is None:
-        u = grid.create_field(cardinality=velocity_set.d, fill_value=0.0, dtype=precision_policy.compute_precision)
+def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho, u):
     equilibrium = MultiresQuadraticEquilibrium()
-    if backend == ComputeBackend.NEON:
-        for level in range(grid.count_levels):
-            equilibrium(level,rho, u, f)
-        pass
-    else:
-        raise NotImplementedError(f"Backend {backend} not implemented")
-
-    del rho, u
-
+    for level in range(grid.count_levels):
+        print("MultiresQuadraticEquilibrium")
+        equilibrium(level = level,
+                    rho= rho,
+                    u=u,
+                    f=f,
+                    stream= 0)
     return f
\ No newline at end of file
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index c5fd233e..f3899b1c 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -16,22 +16,24 @@ def __init__(self, grid, velocity_set, stepper, omega):
         self.omega = omega
         count_levels = grid.count_levels
         # Create fields
-        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields()
-        # self.f_0 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-        # self.f_1 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-        # self.missing_mask = grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
-        # self.bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
-
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
-
         fname_prefix='test'
-        self.rho.fill_run(0, 0.0, 0)
+        self.u.fill_run(0, 0.0, 0)
+        self.u.fill_run(1, 0.0, 0)
         self.rho.fill_run(0, 1.0, 0)
+        self.rho.fill_run(1, 1.0, 0)
+        wp.synchronize()
+        self.u.update_host(0)
         wp.synchronize()
-        self.rho.update_host(0)
+        self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
+
+
+        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields(rho=self.rho,u=self.u)
+        wp.synchronize()
+        self.u.update_host(0)
         wp.synchronize()
-        self.rho.export_vti(f"{fname_prefix}_topology.vti", 'u')
+        self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
 
         self.odd_step = None
         self.even_step = None
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index 20974f5e..1ce28674 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -81,21 +81,19 @@ def quadratic_equilibrium_cl(index: typing.Any):
                     _rho = wp.neon_read(rho_pn, index, 0)
                     feq = functional(_rho, _u)
 
+                    if wp.neon_has_children(f_pn, index):
+                        for l in range(self.velocity_set.q):
+                            feq[l] = self.compute_dtype(0.0)
                     # Set the output
                     for l in range(self.velocity_set.q):
-                        #wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
                         wp.neon_write(f_pn, index, l, feq[l])
-                    if wp.neon_has_children(f_pn, index):
-                        for l in range(self.velocity_set.q):
-                            zero_val = self.compute_dtype(0.0)
-                            wp.neon_write(f_pn, index, l, zero_val)
                 loader.declare_kernel(quadratic_equilibrium_cl)
             return quadratic_equilibrium_ll
         return functional, container
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, leve, rho, u, f):
-        c = self.neon_container( leve, rho, u, f)
-        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+    def neon_implementation(self, level, rho, u, f, stream):
+        c = self.neon_container( level, rho, u, f)
+        c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
 
         return f
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index cc657903..ec467255 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -62,6 +62,7 @@ def warp_implementation(self, f, rho, u):
     def _construct_neon(self):
         zero_moment_func = self.zero_moment.neon_functional
         first_moment_func = self.first_moment.neon_functional
+        print(f"VELOCITY SET: {self.velocity_set.q}")
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         @wp.func
@@ -86,10 +87,10 @@ def container(
             def macroscopic_ll(loader: neon.Loader):
                 loader.set_mres_grid(f_field.get_grid(), level)
 
-                rho=loader.get_mres_read_handle(rho_field)
-                u =loader.get_mres_read_handle(u_fild)
-                f=loader.get_mres_write_handle(f_field)
-                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+                rho=loader.get_mres_write_handle(rho_field)
+                u =loader.get_mres_write_handle(u_fild)
+                f=loader.get_mres_read_handle(f_field)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask)
 
                 @wp.func
                 def macroscopic_cl(gIdx: typing.Any):
@@ -98,12 +99,15 @@ def macroscopic_cl(gIdx: typing.Any):
 
                     for l in range(self.velocity_set.q):
                         _f[l] = wp.neon_read(f, gIdx,l)
+
                     _rho, _u = functional(_f)
+
                     if _boundary_id != wp.uint8(0):
                         _rho = self.compute_dtype(1.0)
                         for d in range(_d):
                             _u[d] = self.compute_dtype(0.0)
-                    if _boundary_id == wp.uint8(255):
+
+                    if _boundary_id == wp.uint8(255) or wp.neon_has_children(f, gIdx):
                         _rho = self.compute_dtype(0.0)
                         for d in range(_d):
                             _u[d] = self.compute_dtype(0.0)
@@ -112,20 +116,6 @@ def macroscopic_cl(gIdx: typing.Any):
                     for d in range(_d):
                         wp.neon_write(u, gIdx, d, _u[d])
 
-                    if wp.neon_has_children(f, gIdx):
-                        offVal = self.compute_dtype(-33000.0)
-                        zero_val = self.compute_dtype(0.0)
-                        wp.neon_write(rho, gIdx, 0, zero_val)
-                        wp.neon_write(u, gIdx, 0, offVal)
-                        wp.neon_write(u, gIdx, 1, zero_val)
-                        wp.neon_write(u, gIdx, 2, zero_val)
-                    else:
-                        offVal = self.compute_dtype(+33000.0)
-                        zero_val = self.compute_dtype(0.0)
-                        wp.neon_write(rho, gIdx, 0, zero_val)
-                        wp.neon_write(u, gIdx, 0, offVal)
-                        wp.neon_write(u, gIdx, 1, zero_val)
-                        wp.neon_write(u, gIdx, 2, zero_val)
                 loader.declare_kernel(macroscopic_cl)
             return macroscopic_ll
         return functional, container
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 40a834cb..efd6d6d3 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -22,7 +22,6 @@
 from xlb.operator.collision import ForcedCollision
 from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
 from xlb.helper import check_bc_overlaps
-from xlb.helper.nse_solver import create_nse_fields
 
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
@@ -53,7 +52,7 @@ def __init__(
         self.equilibrium = QuadraticEquilibrium(self.velocity_set, self.precision_policy, self.compute_backend)
         self.macroscopic = Macroscopic(self.velocity_set, self.precision_policy, self.compute_backend)
 
-    def prepare_fields(self, initializer=None):
+    def prepare_fields(self, rho, u, initializer=None):
         """Prepare the fields required for the stepper.
 
         Args:
@@ -69,32 +68,33 @@ def prepare_fields(self, initializer=None):
                 - bc_mask: Boundary condition mask indicating which BC applies to each node
                 - missing_mask: Mask indicating which populations are missing at boundary nodes
         """
-        # Create fields using the helper function
-        _, f_0, f_1, missing_mask, bc_mask = create_nse_fields(
-            grid=self.grid, velocity_set=self.velocity_set, compute_backend=self.compute_backend, precision_policy=self.precision_policy
-        )
 
-        # Initialize distribution functions if initializer is provided
-        if initializer is not None:
-            # throw an exception because this option is not implemented yet
-            raise Exception("Initializer is not implemented yet")
-            #f_0 = initializer(self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
-        else:
-            from xlb.helper.initializers import initialize_multires_eq
-            f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
+        f_0 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        f_1 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
+        bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
+
+        from xlb.helper.initializers import initialize_multires_eq
+        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
+
+        for level in range(self.grid.count_levels):
+            f_1.copy_from_run(level, f_0, 0)
+        f_0.update_host(0)
+        wp.synchronize()
+        f_0.export_vti("f0_eq_init.vti", "init_f0")
 
-        if self.compute_backend == ComputeBackend.NEON:
-            for level in range(self.grid.count_levels):
-                f_1.copy_from_run(level, f_0, 0)
 
         # Process boundary conditions and update masks
         bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
         f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
+        # bc_mask.update_host(0)
         bc_mask.update_host(0)
-        missing_mask.update_host(0)
+        f_0.update_host(0)
         wp.synchronize()
         bc_mask.export_vti("bc_mask.vti", 'bc_mask')
+        f_0.export_vti("init_f0.vti", 'init_f0')
+
         #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         return f_0, f_1, bc_mask, missing_mask
@@ -357,10 +357,7 @@ def cl_collide_coarse(index: typing.Any):
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
-
-                    wp.print("collide_coarse")
-
-
+                    wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_collide_coarse)
             return ll_collide_coarse
@@ -412,8 +409,7 @@ def cl_stream_coarse(index: typing.Any):
                                     is_valid = wp.bool(False)
                                     read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
                                     if is_valid:
-                                        wp.print("read_accumulate_date")
-                                        _f_post_stream[l] = self.compute_dtype(33) #read_accumulate_date * self.compute_dtype(0.5)
+                                        _f_post_stream[l] = read_accumulate_date #read_accumulate_date * self.compute_dtype(0.5)
 
                             # do non mres post-streaming corrections
                             _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_stream, True)

From 1337a2b2cb23e5943155f21e0cac586902bbd176 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 10 Apr 2025 21:55:12 +0200
Subject: [PATCH 017/208] WIP

---
 examples/performance/mlups_3d_multires_solver.py | 2 +-
 xlb/helper/nse_multires_solver.py                | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index b5375568..6731725b 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -126,7 +126,7 @@ def peel(dim, idx, peel_level, outwards):
     prescribed_vel = 0.05
 
     boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
-                           FullwayBounceBackBC(indices=walls)]
+                           EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls)]
 
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index f3899b1c..27f7cb7a 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -91,17 +91,11 @@ def step(self):
 
         if self.iteration_idx % 2 == 0:
             self.containers["1"]["even"]['collide_coarse'].run(0)
-            wp.synchronize()
             self.containers["0"]["even"]['collide_coarse'].run(0)
-            wp.synchronize()
             self.containers["0"]["even"]['stream_coarse'].run(0)
-            wp.synchronize()
             self.containers["0"]["odd"]['collide_coarse'].run(0)
-            wp.synchronize()
             self.containers["0"]["odd"]['stream_coarse'].run(0)
-            wp.synchronize()
             self.containers["1"]["even"]['stream_coarse'].run(0)
-            wp.synchronize()
         else:
             self.containers["1"]["odd"]["collide_coarse"].run(0)
             self.containers["0"]["even"]["collide_coarse"].run(0)

From 090837e13629df63a2cd5472dc38bc2bd90994fc Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 22 Apr 2025 15:47:36 +0200
Subject: [PATCH 018/208] WIP: single level

---
 .../mlups_3d_multires_solver_single_level.py  | 186 +++++++
 xlb/helper/nse_multires_solver.py             | 136 +++--
 .../macroscopic/multires_macroscopic.py       |  18 +
 xlb/operator/stepper/nse_multires_stepper.py  |  25 +-
 .../stepper/nse_multires_stepper_vk.py        | 481 ++++++++++++++++++
 5 files changed, 794 insertions(+), 52 deletions(-)
 create mode 100644 examples/performance/mlups_3d_multires_solver_single_level.py
 create mode 100644 xlb/operator/stepper/nse_multires_stepper_vk.py

diff --git a/examples/performance/mlups_3d_multires_solver_single_level.py b/examples/performance/mlups_3d_multires_solver_single_level.py
new file mode 100644
index 00000000..73095630
--- /dev/null
+++ b/examples/performance/mlups_3d_multires_solver_single_level.py
@@ -0,0 +1,186 @@
+import xlb
+import argparse
+import time
+import warp as wp
+import numpy as np
+
+# add a directory to the PYTHON PATH
+import sys
+# sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
+import neon
+
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
+from xlb.distribute import distribute
+
+def parse_arguments():
+    """Parse the benchmark's command-line arguments and return the resulting ``argparse.Namespace``."""
+    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
+    # Positional arguments
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
+    parser.add_argument("num_steps", type=int, help="Number of timesteps to simulate")
+    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
+    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+
+    # Optional arguments
+    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
+    # The lattice name is validated later by setup_simulation(), which rejects anything but D3Q19/D3Q27.
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
+
+    return parser.parse_args()
+
+
+def setup_simulation(args):
+    """Translate the parsed CLI strings into XLB objects and initialise the library defaults.
+
+    Returns the ``(backend, precision_policy)`` pair; raises ``ValueError`` for an unknown backend,
+    precision string or velocity set.
+    """
+    backend_by_name = {"jax": ComputeBackend.JAX, "warp": ComputeBackend.WARP, "neon": ComputeBackend.NEON}
+    backend = backend_by_name.get(args.backend)
+    if backend is None:
+        raise ValueError("Invalid backend")
+
+    policy_by_name = {
+        "fp32/fp32": PrecisionPolicy.FP32FP32,
+        "fp64/fp64": PrecisionPolicy.FP64FP64,
+        "fp64/fp32": PrecisionPolicy.FP64FP32,
+        "fp32/fp16": PrecisionPolicy.FP32FP16,
+    }
+    precision_policy = policy_by_name.get(args.precision)
+    if precision_policy is None:
+        raise ValueError("Invalid precision")
+
+    # Look the lattice class up first so that it is only instantiated for a supported name.
+    lattice_by_name = {"D3Q19": xlb.velocity_set.D3Q19, "D3Q27": xlb.velocity_set.D3Q27}
+    lattice_cls = lattice_by_name.get(args.velocity_set)
+    if lattice_cls is None:
+        raise ValueError("Invalid velocity set")
+    velocity_set = lattice_cls(precision_policy=precision_policy, backend=backend)
+
+    xlb.init(velocity_set=velocity_set, default_backend=backend, default_precision_policy=precision_policy)
+
+    return backend, precision_policy
+
+
+def run(backend, precision_policy, grid_shape, num_steps):
+    """Run the single-level lid-driven cavity for ``num_steps`` steps and return the elapsed wall-clock seconds."""
+    velocity_set = xlb.DefaultConfig.velocity_set  # lattice registered by xlb.init(), so --velocity_set is honoured
+    # (backend / precision_policy stay in the signature for callers; the lattice above already carries them)
+    def peel(dim, idx, peel_level, outwards):
+        if outwards:
+            xIn =  idx.x <= peel_level or idx.x >= dim.x -1 -peel_level
+            yIn =  idx.y <= peel_level or idx.y >= dim.y -1 -peel_level
+            zIn =  idx.z <= peel_level or idx.z >= dim.z -1 - peel_level
+            return xIn or yIn or zIn
+        else:
+            xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
+            yIn = idx.y >= peel_level and idx.y <= dim.y - 1 - peel_level
+            zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
+            return xIn and yIn and zIn
+
+    # NOTE: `peel` is currently unused; the level-0 mask built below is all ones.
+    dim = neon.Index_3d(grid_shape[0],
+                        grid_shape[1],
+                        grid_shape[2])
+    level_zero_mask = np.ones((dim.x, dim.y, dim.z), dtype=int)
+    level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
+
+    # Single-level multires grid: one sparsity pattern anchored at the origin.
+    grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
+                                 sparsity_pattern_list=[ level_zero_mask,],
+                                 sparsity_pattern_origins=[ neon.Index_3d(0, 0, 0),])
+
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    lid = box_no_edge["top"]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+
+    prescribed_vel = 0.05
+
+    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
+                           EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls)]
+
+    # Create stepper
+    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
+
+    Re = 10000.0
+    clength = grid_shape[0] - 1
+    visc = prescribed_vel * clength / Re
+    omega = 1.0 / (3.0 * visc + 0.5)
+
+    # Initialize fields and build the simulation driver
+    # (debug override kept for reference: omega = 1.0)
+
+    sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
+
+    sim.export_macroscopic("Initial_")
+
+    print("start timing")
+    start_time = time.time()
+
+    for i in range(num_steps):
+        print(f"step {i}")
+        sim.step()
+        if i%1 == 0:  # always true: exports after every step, so the timed region includes the export cost
+            sim.export_macroscopic("u_lid_driven_cavity_")
+    wp.synchronize()
+    t = time.time() - start_time
+
+    sim.export_macroscopic("u_lid_driven_cavity_")
+    return t
+
+
+def calculate_mlups(cube_edge, num_steps, elapsed_time):
+    """Return millions of lattice updates per second for a ``cube_edge``**3 grid advanced ``num_steps`` times."""
+    updates_per_second = cube_edge**3 * num_steps / elapsed_time
+    return updates_per_second / 1e6
+
+def post_process(macro, rho, u, f_0,  i):
+    """Recompute rho and u from ``f_0`` with ``macro`` and export both fields as VTI files.
+
+    ``i`` is only used to build the output file names
+    (``u_lid_driven_cavity_{i}.vti`` and ``rho_lid_driven_cavity_{i}.vti``).
+    NOTE(review): nothing in this script calls this helper; it appears to be kept for
+    manual debugging -- confirm before relying on it.
+    """
+    rho, u = macro(f_0, rho, u )
+    wp.synchronize()
+    u.update_host(0)  # presumably copies the device data to the host before export -- TODO confirm
+    rho.update_host(0)
+    wp.synchronize()
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+
+    pass
+
+    # # remove boundary cells
+    # rho = rho[:, 1:-1, 1:-1, 1:-1]
+    # u = u[:, 1:-1, 1:-1, 1:-1]
+    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
+    #
+    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
+    #
+    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
+    # ny=fields["u_magnitude"].shape[1]
+    # from xlb.utils import  save_image
+    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
+
+def main():
+    """Entry point: parse the CLI, run the benchmark and print the elapsed time and MLUPs."""
+    args = parse_arguments()
+    backend, precision_policy = setup_simulation(args)
+    edge, steps = args.cube_edge, args.num_steps
+    elapsed_time = run(backend, precision_policy, (edge, edge, edge), steps)
+    mlups = calculate_mlups(edge, steps, elapsed_time)
+
+    print(f"Simulation completed in {elapsed_time:.2f} seconds")
+    print(f"MLUPs: {mlups:.2f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index 27f7cb7a..a3a9bfa7 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -14,21 +14,20 @@ def __init__(self, grid, velocity_set, stepper, omega):
         self.precision_policy = stepper.get_precision_policy()
         self.velocity_set = velocity_set
         self.omega = omega
-        count_levels = grid.count_levels
+        self.count_levels = grid.count_levels
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
         fname_prefix='test'
-        self.u.fill_run(0, 0.0, 0)
-        self.u.fill_run(1, 0.0, 0)
-        self.rho.fill_run(0, 1.0, 0)
-        self.rho.fill_run(1, 1.0, 0)
+
+        for level in range(self.count_levels):
+            self.u.fill_run(level, 0.0, 0)
+            self.rho.fill_run(level, 1.0, 0)
         wp.synchronize()
         self.u.update_host(0)
         wp.synchronize()
         self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
 
-
         self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields(rho=self.rho,u=self.u)
         wp.synchronize()
         self.u.update_host(0)
@@ -46,35 +45,31 @@ def __init__(self, grid, velocity_set, stepper, omega):
             velocity_set=self.velocity_set,
         )
 
-        self.__init_containers(count_levels)
+        self.__init_containers(self.count_levels)
 
     def __init_containers(self, num_levels):
         # working only with level 0 for now
         self.containers = {}
         self.macroscopics = {}
 
-        for target_level in range(num_levels):
-            self.containers[f"{target_level}"] = self.stepper.get_containers(target_level,
-                                                     self.f_0,
-                                                     self.f_1,
-                                                     self.bc_mask,
-                                                     self.missing_mask,
-                                                     self.omega,
-                                                     self.iteration_idx)
-            pass
-
-        for target_level in range(num_levels):
-            self.macroscopics[f"{target_level}"] = self.macro.get_containers(target_level, self.f_0, self.f_1, self.bc_mask, self.rho, self.u)
-
+        # for target_level in range(num_levels):
+        #     self.containers[f"{target_level}"] = self.stepper.get_containers(target_level,
+        #                                              self.f_0,
+        #                                              self.f_1,
+        #                                              self.bc_mask,
+        #                                              self.missing_mask,
+        #                                              self.omega,
+        #                                              self.iteration_idx)
+        #     pass
+
+        # for target_level in range(num_levels):
+        #     self.macroscopics[f"{target_level}"] = self.macro.get_containers(target_level, self.f_0, self.f_1, self.bc_mask, self.rho, self.u)
+        self.stepper.init_containers()
+        self.macro.init_containers()
 
     def export_macroscopic(self, fname_prefix):
         print(f"exporting macroscopic: #levels {self.grid.count_levels}")
-        for target_level in range(self.grid.count_levels):
-            if self.iteration_idx % 2 == 0:
-                self.macroscopics[f"{target_level}"]['even'][0].run(0)
-            else:
-                self.macroscopics[f"{target_level}"]['odd'][0].run(0)
-
+        self.macro.launch_container(streamId = 0, f_0 = self.f_0, bc_mask = self.bc_mask, rho = self.rho, u = self.u)
 
         import warp as wp
         wp.synchronize()
@@ -88,18 +83,79 @@ def export_macroscopic(self, fname_prefix):
     # one step at the corase level
     def step(self):
         self.iteration_idx += 1
+        iteration_id = self.iteration_idx % 2  # parity of the step counter, forwarded as `timestep`
+
+        # Launch order: [level 1 collide] -> level 0 collide, stream, collide, stream -> [level 1 stream].
+        ## LEVEL 1 (only when the grid has exactly two levels)
+        if self.count_levels == 2:
+            self.stepper.launch_container(
+                streamId=0,
+                op_name="collide_coarse",
+                mres_level=1,
+                f_0=self.f_0,
+                f_1=self.f_1,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.omega,
+                timestep=iteration_id,
+            )
+        ## LEVEL 0
+        self.stepper.launch_container(
+            streamId=0,
+            op_name="collide_coarse",
+            mres_level=0,
+            f_0=self.f_0,
+            f_1=self.f_1,
+            bc_mask=self.bc_mask,
+            missing_mask=self.missing_mask,
+            omega=self.omega,
+            timestep=iteration_id,
+        )
+        self.stepper.launch_container(
+            streamId=0,
+            op_name="stream_coarse",
+            mres_level=0,
+            f_0=self.f_1,  # the stream launches receive the two buffers swapped
+            f_1=self.f_0,
+            bc_mask=self.bc_mask,
+            missing_mask=self.missing_mask,
+            omega=self.omega,
+            timestep=iteration_id,
+        )
+        self.stepper.launch_container(
+            streamId=0,
+            op_name="collide_coarse",
+            mres_level=0,
+            f_0=self.f_0,
+            f_1=self.f_1,
+            bc_mask=self.bc_mask,
+            missing_mask=self.missing_mask,
+            omega=self.omega,
+            timestep=iteration_id,
+        )
+        self.stepper.launch_container(
+            streamId=0,
+            op_name="stream_coarse",
+            mres_level=0,
+            f_0=self.f_1,
+            f_1=self.f_0,
+            bc_mask=self.bc_mask,
+            missing_mask=self.missing_mask,
+            omega=self.omega,
+            timestep=iteration_id,
+        )
+        # LEVEL 1: closing stream (buffers swapped like the level-0 stream launches), as in the replaced code path
+        if self.count_levels == 2:
+            self.stepper.launch_container(
+                streamId=0,
+                op_name="stream_coarse",
+                mres_level=1,
+                f_0=self.f_1,
+                f_1=self.f_0,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.omega,
+                timestep=iteration_id,
+            )
+
 
-        if self.iteration_idx % 2 == 0:
-            self.containers["1"]["even"]['collide_coarse'].run(0)
-            self.containers["0"]["even"]['collide_coarse'].run(0)
-            self.containers["0"]["even"]['stream_coarse'].run(0)
-            self.containers["0"]["odd"]['collide_coarse'].run(0)
-            self.containers["0"]["odd"]['stream_coarse'].run(0)
-            self.containers["1"]["even"]['stream_coarse'].run(0)
-        else:
-            self.containers["1"]["odd"]["collide_coarse"].run(0)
-            self.containers["0"]["even"]["collide_coarse"].run(0)
-            self.containers["0"]["even"]["stream_coarse"].run(0)
-            self.containers["0"]["odd"]["collide_coarse"].run(0)
-            self.containers["0"]["odd"]["stream_coarse"].run(0)
-            self.containers["1"]["odd"]["stream_coarse"].run(0)
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index ec467255..18d7a925 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -129,6 +129,24 @@ def get_containers(self, target_level, f_0, f_1, bc_mask, rho, u):
         return {'even':evenList ,
                 'odd':oddList }
 
+    def get_container(self, target_level, f_0, f_1, bc_mask, rho, u):
+        """Build the level-``target_level`` macroscopic containers for ``f_0`` ("macro") and ``f_1`` ("odd")."""
+        # Call the factory through ``self.container``: a bare ``container`` name is never bound in this method.
+        _, self.container = self._construct_neon()
+        macro_list = [self.container(target_level, f_0, bc_mask, rho, u)]
+        odd_list = [self.container(target_level, f_1, bc_mask, rho, u)]
+        return {"macro": macro_list, "odd": odd_list}
+
+    def init_containers(self):
+        self.containers=None  # cleared first; NOTE(review): presumably so a failed build leaves no stale value -- confirm
+        _, self.containers = self._construct_neon()  # keep the second item of the pair; launch_container() calls it per level
+
+    def launch_container(self, streamId, f_0, bc_mask, rho, u):
+        """Build and run the macroscopic container of every level of ``f_0``'s grid on stream ``streamId``."""
+        for level in range(f_0.get_grid().num_levels):
+            self.containers(level, f_0, bc_mask, rho, u).run(streamId)
+
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho, u):
         c = self.neon_container(f, rho, u)
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index efd6d6d3..cdfe854a 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -83,7 +83,6 @@ def prepare_fields(self, rho, u, initializer=None):
         wp.synchronize()
         f_0.export_vti("f0_eq_init.vti", "init_f0")
 
-
         # Process boundary conditions and update masks
         bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
@@ -256,7 +255,6 @@ def cl_single_step_finest(index: typing.Any):
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, True)
                     _opposite_c_idx = self.velocity_set.self.opp_indices
 
-
                     for l in range(self.velocity_set.q):
                         push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
                                                    wp.int8(_c[1, l]),
@@ -290,11 +288,9 @@ def cl_single_step_finest(index: typing.Any):
                                 if is_valid:
                                     wp.neon_write(f_1_pn, index, l, _f_post_stream[l], value)
 
-
                 loader.declare_kernel(cl_single_step_finest)
             return ll_single_step_finest
 
-
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(
                 level: int,
@@ -308,7 +304,7 @@ def collide_coarse(
             num_levels = f_0_fd.get_grid().get_num_levels()
 
             # module op to define odd of even iteration
-            od_or_even = wp.mod(timestep, 2)
+            even_itertation = wp.mod(timestep, 2)==0
 
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -348,10 +344,10 @@ def cl_collide_coarse(index: typing.Any):
                             push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
                             if(level < num_levels - 1):
                                 ## Store
-                                if od_or_even == 0:
-                                    wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
-                                else:
-                                    wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_collision[l])
+                                # if even_itertation == 0:
+                                #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
+                                # else:
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_collision[l])
 
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
@@ -427,8 +423,6 @@ def cl_stream_coarse(index: typing.Any):
             "collide_coarse": collide_coarse,
             "stream_coarse": stream_coarse}
 
-
-
     def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         containers = {'even': {}, 'odd': {}}
         _, container = self._construct_neon()
@@ -437,6 +431,13 @@ def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega,
             containers['even'][key] = container[key](target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
         return containers
 
+    def init_containers(self):
+        self.containers=None  # cleared first; NOTE(review): presumably so a failed build leaves no stale value -- confirm
+        _, self.containers = self._construct_neon()  # keep the second item of the pair; launch_container() indexes it by op_name
+
+    def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
+        self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(streamId)  # run on the requested stream, not a hard-coded 0
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         #if self.c is None:
@@ -450,7 +451,7 @@ def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         # if c is None:
         #     pass
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
-        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+        c.run(0)
         #
         # if self.odd_or_even == 'even':
         #     c = self.c_even
diff --git a/xlb/operator/stepper/nse_multires_stepper_vk.py b/xlb/operator/stepper/nse_multires_stepper_vk.py
new file mode 100644
index 00000000..bd3622a9
--- /dev/null
+++ b/xlb/operator/stepper/nse_multires_stepper_vk.py
@@ -0,0 +1,481 @@
+# Base class for all stepper operators
+
+from functools import partial
+
+from docutils.nodes import container
+from jax import jit
+import warp as wp
+import neon
+from typing import Any
+
+from xlb import DefaultConfig
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import Precision
+from xlb.operator import Operator
+from xlb.operator.stream import Stream
+from xlb.operator.collision import BGK, KBC
+from xlb.operator.equilibrium import QuadraticEquilibrium
+from xlb.operator.macroscopic import Macroscopic
+from xlb.operator.stepper import Stepper
+from xlb.operator.boundary_condition.boundary_condition import ImplementationStep
+from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
+from xlb.operator.collision import ForcedCollision
+from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
+from xlb.helper import check_bc_overlaps
+
+
+class MultiresIncompressibleNavierStokesStepper(Stepper):
+    def __init__(
+        self,
+        grid,
+        boundary_conditions=None,
+        collision_type="BGK",
+        forcing_scheme="exact_difference",
+        force_vector=None,
+    ):
+        """Build the collision, stream, equilibrium and macroscopic operators; ``collision_type`` is "BGK" or "KBC"."""
+        super().__init__(grid, [] if boundary_conditions is None else boundary_conditions)  # fresh list, never a shared default
+        self.odd_or_even = "even"
+        self.c_even = self.c_odd = None
+
+        # Construct the collision operator
+        if collision_type == "BGK":
+            self.collision = BGK(self.velocity_set, self.precision_policy, self.compute_backend)
+        elif collision_type == "KBC":
+            self.collision = KBC(self.velocity_set, self.precision_policy, self.compute_backend)
+        else:  # fail here rather than later with a missing ``self.collision`` attribute
+            raise ValueError(f"Unsupported collision_type: {collision_type!r}")
+        if force_vector is not None:
+            self.collision = ForcedCollision(collision_operator=self.collision, forcing_scheme=forcing_scheme, force_vector=force_vector)
+        # Construct the operators
+        self.stream = Stream(self.velocity_set, self.precision_policy, self.compute_backend)
+        self.equilibrium = QuadraticEquilibrium(self.velocity_set, self.precision_policy, self.compute_backend)
+        self.macroscopic = Macroscopic(self.velocity_set, self.precision_policy, self.compute_backend)
+
+    def prepare_fields(self, rho, u, initializer=None):
+        """Allocate and initialise the fields required by the stepper.
+
+        Args:
+            rho: Density field forwarded to ``initialize_multires_eq`` to initialise ``f_0``.
+            u: Velocity field forwarded to ``initialize_multires_eq`` to initialise ``f_0``.
+            initializer: Accepted but not used by this implementation.
+
+        Also writes ``f0_eq_init.vti``, ``bc_mask.vti`` and ``init_f0.vti`` as a side effect.
+
+        Returns:
+            Tuple of (f_0, f_1, bc_mask, missing_mask):
+                - f_0: Initial distribution functions
+                - f_1: Copied from f_0 level by level, for double-buffering
+                - bc_mask / missing_mask: Masks returned by ``_process_boundary_conditions``
+        """
+
+        f_0 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        f_1 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
+        bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
+
+        from xlb.helper.initializers import initialize_multires_eq
+        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
+
+        for level in range(self.grid.count_levels):
+            f_1.copy_from_run(level, f_0, 0)
+        f_0.update_host(0)
+        wp.synchronize()
+        f_0.export_vti("f0_eq_init.vti", "init_f0")
+
+        # Process boundary conditions and update masks
+        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
+        # Initialize auxiliary data if needed
+        f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
+        # Refresh the host side of bc_mask and f_0 before exporting them (presumably a device-to-host copy -- TODO confirm)
+        bc_mask.update_host(0)
+        f_0.update_host(0)
+        wp.synchronize()
+        bc_mask.export_vti("bc_mask.vti", 'bc_mask')
+        f_0.export_vti("init_f0.vti", 'init_f0')
+
+        # missing_mask is not exported: missing_mask.export_vti("missing_mask.vti", 'missing_mask')
+
+        return f_0, f_1, bc_mask, missing_mask
+
+    @classmethod
+    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
+        """Check the BCs for overlaps, apply the indices-based ones to the masks and return (bc_mask, missing_mask)."""
+        # Check for boundary condition overlaps
+        check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
+        # Create boundary maskers
+        indices_masker = IndicesBoundaryMasker(
+            velocity_set=DefaultConfig.velocity_set,
+            precision_policy=DefaultConfig.default_precision_policy,
+            compute_backend=DefaultConfig.default_backend,
+        )
+        # Split boundary conditions by type
+        bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
+        bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
+        # Process indices-based boundary conditions
+        if bc_with_indices:
+            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
+        # Mesh-based boundary conditions are rejected in 3D; for any other dimension they are skipped without notice.
+        if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
+            # NotImplementedError is still an Exception subclass, so callers catching Exception keep working.
+            raise NotImplementedError("Mesh-based boundary conditions are not implemented yet")
+            # mesh_masker = MeshBoundaryMasker(
+            #     velocity_set=DefaultConfig.velocity_set,
+            #     precision_policy=DefaultConfig.default_precision_policy,
+            #     compute_backend=DefaultConfig.default_backend,
+            # )
+            # for bc in bc_with_vertices:
+            #     bc_mask, missing_mask = mesh_masker(bc, bc_mask, missing_mask)
+
+        return bc_mask, missing_mask
+
+    @staticmethod
+    def _initialize_auxiliary_data(boundary_conditions, f_0, f_1, bc_mask, missing_mask):
+        """Run ``aux_data_init`` for every BC that needs it and is not initialised yet; return the updated (f_0, f_1)."""
+        pending = (bc for bc in boundary_conditions if bc.needs_aux_init and not bc.is_initialized_with_aux_data)
+        for bc in pending:  # lazy generator: each BC is checked right before its own initialisation
+            f_0, f_1 = bc.aux_data_init(f_0, f_1, bc_mask, missing_mask)
+        return f_0, f_1
+
+    def _construct_neon(self):
+        # Set local constants
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+        _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
+        _opp_indices = self.velocity_set.opp_indices
+        #_cast_to_store_dtype = self.store_dtype()
+
+        # Read the list of bc_to_id created upon instantiation
+        bc_to_id = boundary_condition_registry.bc_to_id
+        id_to_bc = boundary_condition_registry.id_to_bc
+        _zero = self.compute_dtype(0)
+        # Gather IDs of ExtrapolationOutflowBC boundary conditions
+        extrapolation_outflow_bc_ids = []
+        for bc_name, bc_id in bc_to_id.items():
+            if bc_name.startswith("ExtrapolationOutflowBC"):
+                extrapolation_outflow_bc_ids.append(bc_id)
+        # Group active boundary conditions
+        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
+
+        @wp.func
+        def apply_bc(
+            index: Any,
+            timestep: Any,
+            _boundary_id: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+            is_post_streaming: bool,
+        ):
+            f_result = f_post
+
+            # Unroll the loop over boundary conditions
+            for i in range(wp.static(len(self.boundary_conditions))):
+                if is_post_streaming:
+                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                else:
+                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                    if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
+                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                            f_result = wp.static(self.boundary_conditions[i].prepare_bc_auxilary_data)(
+                                index, timestep, missing_mask, f_0, f_1, f_pre, f_post
+                            )
+            return f_result
+
+        @wp.func
+        def neon_get_thread_data(
+            f0_pn: Any,
+            f1_pn: Any,
+            missing_mask_pn: Any,
+            index: Any,
+        ):
+            # Read thread data for populations
+            _f0_thread = _f_vec()
+            _f1_thread = _f_vec()
+            _missing_mask = _missing_mask_vec()
+            for l in range(self.velocity_set.q):
+                # q-sized vector of pre-streaming populations
+                _f0_thread[l] = self.compute_dtype(wp.neon_read(f0_pn, index, l))
+                _f1_thread[l] = self.compute_dtype(wp.neon_read(f1_pn, index, l))
+                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
+
+            return _f0_thread, _f1_thread, _missing_mask
+
+        import typing
+        @neon.Container.factory(name="finest_collide")
+        def single_step_finest(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
+        ):
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_single_step_finest(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn=loader.get_mres_read_handle(f_0_fd)
+                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
+
+                f_1_pn =loader.get_mres_write_handle(f_1_fd)
+
+                @wp.func
+                def cl_single_step_finest(index: typing.Any):
+                    _c = self.velocity_set.c
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    # Read thread data for populations, these are post streaming
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f_post_stream = _f0_thread
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    # Apply post-collision boundary conditions
+                    _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+
+                    # Apply streaming boundary conditions
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, True)
+                    _opposite_c_idx = self.velocity_set.self.opp_indices
+
+                    for l in range(self.velocity_set.q):
+                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
+                                                   wp.int8(_c[1, l]),
+                                                   wp.int8(_c[2, l]))
+                        ## Store
+                        if od_or_even == 0:
+                            wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_stream[l])
+                        else:
+                            wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction,_f_post_stream[l])
+
+                        ## Push stream
+                        is_active = wp.neon_is_active(f_0_pn, index, push_direction)
+                        if is_active:
+                            ngh_gidx = wp.neon_ngh_idx(f_0_pn, index, push_direction)
+                            ngh_boundary_id = wp.neon_read(bc_mask_pn, ngh_gidx, 0)
+                            ## WHAT IS BULK?
+                            if ngh_boundary_id == BULK:
+                                wp.neon_write(f_1_pn, ngh_gidx, l, _f_post_stream[l])
+                            else:
+                                opposite_l = _opp_indices[l]
+                                wp.neon_write(f_1_pn, index, opposite_l, _f_post_stream[l])
+                        else:
+                            if wp.int8(_c[0, l]) != 0 and wp.int8(_c[1, l]) != 0 and wp.int8(_c[2, l]) != 0:
+                                opposite_l = _opp_indices[l]
+                                is_valid = False
+                                value = self.compute_dtype(0)
+                                if od_or_even == 0:
+                                    value = wp.neon_uncle_read(f_1_pn, index, push_direction, opposite_l, value, is_valid)
+                                else:
+                                    value = wp.neon_uncle_read(f_0_pn, index, push_direction, opposite_l, value, is_valid)
+                                if is_valid:
+                                    wp.neon_write(f_1_pn, index, l, _f_post_stream[l], value)
+
+                loader.declare_kernel(cl_single_step_finest)
+            return ll_single_step_finest
+
+        @neon.Container.factory(name="collide_coarse")
+        def collide_coarse(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().get_num_levels()
+
+            # module op to define odd of even iteration
+            even_itertation = wp.mod(timestep, 2)==0
+
+            def ll_collide_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn=loader.get_mres_read_handle(f_0_fd)
+                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
+                f_1_pn =loader.get_mres_write_handle(f_1_fd)
+
+                _c = self.velocity_set.c
+
+                @wp.func
+                def cl_collide_coarse(index: typing.Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    """
+                    The c++ version starts with the following, which I am not sure is right:
+                        if (type(cell, 0) == CellType::bulk ) {
+                    CB type cells should do collide too  
+                    """
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    if not wp.neon_has_children(f_0_pn, index):
+
+                        # Read thread data for populations, these are post streaming
+                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                        _f_post_stream = _f0_thread
+
+                        _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                        _feq = self.equilibrium.neon_functional(_rho, _u)
+                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                        # Apply post-collision boundary conditions
+                        _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+
+                        for l in range(self.velocity_set.q):
+                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                            if(level < num_levels - 1):
+                                ## Store
+                                # if even_itertation == 0:
+                                #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
+                                # else:
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_collision[l])
+
+                            wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                    else:
+                        for l in range(self.velocity_set.q):
+                            wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
+                    wp.print("stream_coarse")
+
+                loader.declare_kernel(cl_collide_coarse)
+            return ll_collide_coarse
+
+        @neon.Container.factory(name="stream_coarse")
+        def stream_coarse(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().get_num_levels()
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            #od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+
+                @wp.func
+                def cl_stream_coarse(index: typing.Any):
+                    _missing_mask = _missing_mask_vec()
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(255):
+                        #  if (!pin.hasChildren(cell)) {
+                        if not wp.neon_has_children(f_0_pn, index):
+                            # do stream normally
+                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                            _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                            # do mres corrections
+                            for l in range(self.velocity_set.q):
+                                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+                                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
+
+                                #  if (!pin.hasChildren(cell, dir)) {
+                                if not wp.neon_has_children(f_0_pn, index, pull_direction):
+                                    #if (nghType.mIsValid) {
+                                    # NOTHING as taken  care after
+                                    # } else if (pin.hasParent(cell) && !(dir.x == 0 && dir.y == 0 && dir.z == 0)) {
+                                    if wp.neon_has_parent(f_0_pn, index):
+                                        if pull_direction.x != 0 or pull_direction.y != 0 or pull_direction.z != 0:
+                                            # is_valid = wp.bool(False)
+                                            # uncle_val = wp.neon_uncle_read(f_0_pn, index, pull_direction, l, self.compute_dtype(0), is_valid)
+                                            # if is_valid:
+                                            #     #_f_post_stream[l] = uncle_val
+                                            #     # HERE DB
+                                            _f_post_stream[l] =  self.compute_dtype(0.0)
+                                else:
+                                    is_valid = wp.bool(False)
+                                    read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
+                                    if is_valid:
+                                        #_f_post_stream[l] = read_accumulate_date * self.compute_dtype(0.5)
+                                        # HERE DB
+                                        _f_post_stream[l] = self.compute_dtype(0.0)
+
+                            # do non mres post-streaming corrections
+                            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_stream, True)
+
+                            for l in range(self.velocity_set.q):
+                                wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                    wp.print("stream_coarse")
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        return None, {
+            #"single_step_finest": single_step_finest,
+            "collide_coarse": collide_coarse,
+            "stream_coarse": stream_coarse}
+
+    def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        containers = {'even': {}, 'odd': {}}
+        _, container = self._construct_neon()
+        for key in container.keys():
+            containers['odd'][key] = container[key](target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)
+            containers['even'][key] = container[key](target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
+        return containers
+
+    def init_containers(self):
+        self.containers=None
+        _, self.containers = self._construct_neon()
+
+    def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        #if self.c is None:
+        #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
+        # c = None
+        # if self.odd_or_even == 'even':
+        #     c = self.c_even
+        # else:
+        #     c = self.c_odd
+        #
+        # if c is None:
+        #     pass
+        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
+        c.run(0)
+        #
+        # if self.odd_or_even == 'even':
+        #     c = self.c_even
+        # else:
+        #     c = self.c_odd
+        #
+        # if self.odd_or_even == 'even':
+        #     self.odd_or_even = 'odd'
+
+        return f_0, f_1

From 5fc8020cc45008847d66eeeb764bab65eda96147 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 22 Apr 2025 17:16:49 +0200
Subject: [PATCH 019/208] WIP

---
 xlb/operator/stepper/nse_multires_stepper.py    | 4 ++--
 xlb/operator/stepper/nse_multires_stepper_vk.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index cdfe854a..c5a8440f 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -353,7 +353,7 @@ def cl_collide_coarse(index: typing.Any):
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
-                    wp.print("stream_coarse")
+                    #wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_collide_coarse)
             return ll_collide_coarse
@@ -412,7 +412,7 @@ def cl_stream_coarse(index: typing.Any):
 
                             for l in range(self.velocity_set.q):
                                 wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    wp.print("stream_coarse")
+                    #wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 
diff --git a/xlb/operator/stepper/nse_multires_stepper_vk.py b/xlb/operator/stepper/nse_multires_stepper_vk.py
index bd3622a9..d8c8318a 100644
--- a/xlb/operator/stepper/nse_multires_stepper_vk.py
+++ b/xlb/operator/stepper/nse_multires_stepper_vk.py
@@ -429,7 +429,7 @@ def cl_stream_coarse(index: typing.Any):
 
                             for l in range(self.velocity_set.q):
                                 wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    wp.print("stream_coarse")
+                    # wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 

From e1257c9181a90d9972fbb30cdbdecc5c32db7819 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 22 Apr 2025 17:19:50 +0200
Subject: [PATCH 020/208] WIP

---
 xlb/helper/nse_multires_solver.py            | 98 ++++++++++----------
 xlb/operator/stepper/nse_multires_stepper.py | 82 ----------------
 2 files changed, 49 insertions(+), 131 deletions(-)

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index a3a9bfa7..c2241286 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -85,20 +85,20 @@ def step(self):
         self.iteration_idx += 1
         iteration_id = self.iteration_idx % 2
 
-        # op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep
-        ## LEVEL 1
-        if  self.count_levels == 2:
-            self.stepper.launch_container(
-            streamId= 0,
-            op_name="collide_coarse",
-            mres_level=1,
-            f_0=self.f_0,
-            f_1=self.f_1,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
-            )
+        # # op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep
+        # ## LEVEL 1
+        # if  self.count_levels == 2:
+        #     self.stepper.launch_container(
+        #     streamId= 0,
+        #     op_name="collide_coarse",
+        #     mres_level=1,
+        #     f_0=self.f_0,
+        #     f_1=self.f_1,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        #     )
         ## LEVEL 0
         self.stepper.launch_container(
             streamId= 0,
@@ -122,40 +122,40 @@ def step(self):
             omega=self.omega,
             timestep=iteration_id,
         )
-        self.stepper.launch_container(
-                            streamId= 0,
-            op_name="collide_coarse",
-            mres_level=0,
-            f_0=self.f_0,
-            f_1=self.f_1,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
-        )
-        self.stepper.launch_container(
-            streamId= 0,
-            op_name="stream_coarse",
-            mres_level=0,
-            f_0=self.f_1,
-            f_1=self.f_0,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
-        )
-        # LEVEL 0
-        if  self.count_levels == 2:
-            self.stepper.launch_container(
-                streamId= 0,
-                op_name="collide_coarse",
-                mres_level=1,
-                f_0=self.f_1,
-                f_1=self.f_0,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
-                omega=self.omega,
-                timestep=iteration_id,
-            )
+        # self.stepper.launch_container(
+        #                     streamId= 0,
+        #     op_name="collide_coarse",
+        #     mres_level=0,
+        #     f_0=self.f_0,
+        #     f_1=self.f_1,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        # )
+        # self.stepper.launch_container(
+        #     streamId= 0,
+        #     op_name="stream_coarse",
+        #     mres_level=0,
+        #     f_0=self.f_1,
+        #     f_1=self.f_0,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        # )
+        # # LEVEL 0
+        # if  self.count_levels == 2:
+        #     self.stepper.launch_container(
+        #         streamId= 0,
+        #         op_name="collide_coarse",
+        #         mres_level=1,
+        #         f_0=self.f_1,
+        #         f_1=self.f_0,
+        #         bc_mask=self.bc_mask,
+        #         missing_mask=self.missing_mask,
+        #         omega=self.omega,
+        #         timestep=iteration_id,
+        #     )
 
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index c5a8440f..d8a94c54 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -207,89 +207,7 @@ def neon_get_thread_data(
             return _f0_thread, _f1_thread, _missing_mask
 
         import typing
-        @neon.Container.factory(name="finest_collide")
-        def single_step_finest(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
-        ):
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            od_or_even = wp.module("odd_or_even", "even")
-
-            def ll_single_step_finest(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
-                f_0_pn=loader.get_mres_read_handle(f_0_fd)
-                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
-
-                f_1_pn =loader.get_mres_write_handle(f_1_fd)
-
-                @wp.func
-                def cl_single_step_finest(index: typing.Any):
-                    _c = self.velocity_set.c
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
-                        return
 
-                    # Read thread data for populations, these are post streaming
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
-                    _f_post_stream = _f0_thread
-
-                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                    _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
-
-                    # Apply post-collision boundary conditions
-                    _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
-
-                    # Apply streaming boundary conditions
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, True)
-                    _opposite_c_idx = self.velocity_set.self.opp_indices
-
-                    for l in range(self.velocity_set.q):
-                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
-                                                   wp.int8(_c[1, l]),
-                                                   wp.int8(_c[2, l]))
-                        ## Store
-                        if od_or_even == 0:
-                            wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_stream[l])
-                        else:
-                            wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction,_f_post_stream[l])
-
-                        ## Push stream
-                        is_active = wp.neon_is_active(f_0_pn, index, push_direction)
-                        if is_active:
-                            ngh_gidx = wp.neon_ngh_idx(f_0_pn, index, push_direction)
-                            ngh_boundary_id = wp.neon_read(bc_mask_pn, ngh_gidx, 0)
-                            ## WHAT IS BULK?
-                            if ngh_boundary_id == BULK:
-                                wp.neon_write(f_1_pn, ngh_gidx, l, _f_post_stream[l])
-                            else:
-                                opposite_l = _opp_indices[l]
-                                wp.neon_write(f_1_pn, index, opposite_l, _f_post_stream[l])
-                        else:
-                            if wp.int8(_c[0, l]) != 0 and wp.int8(_c[1, l]) != 0 and wp.int8(_c[2, l]) != 0:
-                                opposite_l = _opp_indices[l]
-                                is_valid = False
-                                value = self.compute_dtype(0)
-                                if od_or_even == 0:
-                                    value = wp.neon_uncle_read(f_1_pn, index, push_direction, opposite_l, value, is_valid)
-                                else:
-                                    value = wp.neon_uncle_read(f_0_pn, index, push_direction, opposite_l, value, is_valid)
-                                if is_valid:
-                                    wp.neon_write(f_1_pn, index, l, _f_post_stream[l], value)
-
-                loader.declare_kernel(cl_single_step_finest)
-            return ll_single_step_finest
 
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(

From 57e0b11f7ccf2d90b3a914f8158daa36145ee1b3 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 23 Apr 2025 15:23:38 +0200
Subject: [PATCH 021/208] Single level working, multi-level runtime error: not
 enough resources

---
 .../performance/mlups_3d_multires_solver.py   |  3 +-
 .../mlups_3d_multires_solver_single_level.py  |  2 +-
 xlb/helper/nse_multires_solver.py             | 98 +++++++++----------
 xlb/operator/stepper/nse_multires_stepper.py  | 20 +++-
 4 files changed, 67 insertions(+), 56 deletions(-)

diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index 6731725b..bbf66af4 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -131,7 +131,8 @@ def peel(dim, idx, peel_level, outwards):
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    Re = 10000.0
+    Re = 100.0
+
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
     omega = 1.0 / (3.0 * visc + 0.5)
diff --git a/examples/performance/mlups_3d_multires_solver_single_level.py b/examples/performance/mlups_3d_multires_solver_single_level.py
index 73095630..3d7aaee8 100644
--- a/examples/performance/mlups_3d_multires_solver_single_level.py
+++ b/examples/performance/mlups_3d_multires_solver_single_level.py
@@ -108,7 +108,7 @@ def peel(dim, idx, peel_level, outwards):
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    Re = 10000.0
+    Re = 100.0
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
     omega = 1.0 / (3.0 * visc + 0.5)
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index c2241286..b0265567 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -85,20 +85,20 @@ def step(self):
         self.iteration_idx += 1
         iteration_id = self.iteration_idx % 2
 
-        # # op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep
-        # ## LEVEL 1
-        # if  self.count_levels == 2:
-        #     self.stepper.launch_container(
-        #     streamId= 0,
-        #     op_name="collide_coarse",
-        #     mres_level=1,
-        #     f_0=self.f_0,
-        #     f_1=self.f_1,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        #     )
+        # op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep
+        ## LEVEL 1
+        if  self.count_levels == 2:
+            self.stepper.launch_container(
+            streamId= 0,
+            op_name="collide_coarse",
+            mres_level=1,
+            f_0=self.f_0,
+            f_1=self.f_1,
+            bc_mask=self.bc_mask,
+            missing_mask=self.missing_mask,
+            omega=self.omega,
+            timestep=iteration_id,
+            )
         ## LEVEL 0
         self.stepper.launch_container(
             streamId= 0,
@@ -122,40 +122,40 @@ def step(self):
             omega=self.omega,
             timestep=iteration_id,
         )
-        # self.stepper.launch_container(
-        #                     streamId= 0,
-        #     op_name="collide_coarse",
-        #     mres_level=0,
-        #     f_0=self.f_0,
-        #     f_1=self.f_1,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        # )
-        # self.stepper.launch_container(
-        #     streamId= 0,
-        #     op_name="stream_coarse",
-        #     mres_level=0,
-        #     f_0=self.f_1,
-        #     f_1=self.f_0,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        # )
-        # # LEVEL 0
-        # if  self.count_levels == 2:
-        #     self.stepper.launch_container(
-        #         streamId= 0,
-        #         op_name="collide_coarse",
-        #         mres_level=1,
-        #         f_0=self.f_1,
-        #         f_1=self.f_0,
-        #         bc_mask=self.bc_mask,
-        #         missing_mask=self.missing_mask,
-        #         omega=self.omega,
-        #         timestep=iteration_id,
-        #     )
+        self.stepper.launch_container(
+                            streamId= 0,
+            op_name="collide_coarse",
+            mres_level=0,
+            f_0=self.f_0,
+            f_1=self.f_1,
+            bc_mask=self.bc_mask,
+            missing_mask=self.missing_mask,
+            omega=self.omega,
+            timestep=iteration_id,
+        )
+        self.stepper.launch_container(
+            streamId= 0,
+            op_name="stream_coarse",
+            mres_level=0,
+            f_0=self.f_1,
+            f_1=self.f_0,
+            bc_mask=self.bc_mask,
+            missing_mask=self.missing_mask,
+            omega=self.omega,
+            timestep=iteration_id,
+        )
+        # LEVEL 0
+        if  self.count_levels == 2:
+            self.stepper.launch_container(
+                streamId= 0,
+                op_name="stream_coarse",
+                mres_level=1,
+                f_0=self.f_1,
+                f_1=self.f_0,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.omega,
+                timestep=iteration_id,
+            )
 
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index d8a94c54..c43a122c 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -312,21 +312,31 @@ def cl_stream_coarse(index: typing.Any):
                     if _boundary_id != wp.uint8(255):
                         if not wp.neon_has_children(f_0_pn, index):
                             # do stream normally
-                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn,
+                                                                                         f_1_pn,
+                                                                                         missing_mask_pn,
+                                                                                         index)
+                            _f_post_collision = _f0_thread
                             _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
-                            # do mres corrections
+                            #do mres corrections
                             for l in range(self.velocity_set.q):
-                                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+                                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
+                                                                 wp.int8(-_c[1, l]),
+                                                                 wp.int8(-_c[2, l]))
                                 _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
                                 if wp.neon_has_children(f_0_pn, index, pull_direction):
                                     is_valid = wp.bool(False)
                                     read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
                                     if is_valid:
-                                        _f_post_stream[l] = read_accumulate_date #read_accumulate_date * self.compute_dtype(0.5)
+                                        _f_post_stream[l] = read_accumulate_date * self.compute_dtype(0.5)
 
                             # do non mres post-streaming corrections
-                            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_stream, True)
+                            _f_post_stream = apply_bc(index, timestep,
+                                                      _boundary_id, _missing_mask,
+                                                      f_0_pn, f_1_pn,
+                                                      _f_post_collision, _f_post_stream,
+                                                      True)
 
                             for l in range(self.velocity_set.q):
                                 wp.neon_write(f_1_pn, index, l, _f_post_stream[l])

From c126568bdccc0e0ba577918053a43c658ecff897 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 24 Apr 2025 16:02:16 +0200
Subject: [PATCH 022/208] WIP: recursive

---
 xlb/helper/nse_multires_solver.py            | 192 ++++++++++++-------
 xlb/operator/stepper/nse_multires_stepper.py | 130 +++++++++++--
 2 files changed, 244 insertions(+), 78 deletions(-)

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index b0265567..9696babd 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -82,74 +82,60 @@ def export_macroscopic(self, fname_prefix):
 
     # one step at the corase level
     def step(self):
-        self.iteration_idx += 1
-        iteration_id = self.iteration_idx % 2
 
-        # op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep
-        ## LEVEL 1
-        if  self.count_levels == 2:
+        def recurtion(level):
+            if level < 0:
+                return
+            print(f"RECURTION down to level {level}")
+            print(f"RECURTION Level {level}, COLLIDE")
+
             self.stepper.launch_container(
-            streamId= 0,
-            op_name="collide_coarse",
-            mres_level=1,
-            f_0=self.f_0,
-            f_1=self.f_1,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
+                streamId=0,
+                op_name="collide_coarse",
+                mres_level=level,
+                f_0=self.f_0,
+                f_1=self.f_1,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.omega,
+                timestep=iteration_id,
             )
-        ## LEVEL 0
-        self.stepper.launch_container(
-            streamId= 0,
-            op_name="collide_coarse",
-            mres_level=0,
-            f_0=self.f_0,
-            f_1=self.f_1,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
-        )
-        self.stepper.launch_container(
-                            streamId= 0,
-            op_name="stream_coarse",
-            mres_level=0,
-            f_0=self.f_1,
-            f_1=self.f_0,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
-        )
-        self.stepper.launch_container(
-                            streamId= 0,
-            op_name="collide_coarse",
-            mres_level=0,
-            f_0=self.f_0,
-            f_1=self.f_1,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
-        )
-        self.stepper.launch_container(
-            streamId= 0,
-            op_name="stream_coarse",
-            mres_level=0,
-            f_0=self.f_1,
-            f_1=self.f_0,
-            bc_mask=self.bc_mask,
-            missing_mask=self.missing_mask,
-            omega=self.omega,
-            timestep=iteration_id,
-        )
-        # LEVEL 0
-        if  self.count_levels == 2:
+
+            recurtion(level-1)
+            recurtion(level-1)
+
+            print(f"RECURTION Level {level}, stream_coarse_step_A")
+            self.stepper.launch_container(
+                streamId=0,
+                op_name="stream_coarse_step_A",
+                mres_level=level,
+                f_0=self.f_1,
+                f_1=self.f_0,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.omega,
+                timestep=iteration_id,
+            )
+            print(f"RECURTION Level {level}, stream_coarse_step_B")
+
+            self.stepper.launch_container(
+                streamId=0,
+                op_name="stream_coarse_step_B",
+                mres_level=level,
+                f_0=self.f_1,
+                f_1=self.f_0,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.omega,
+                timestep=iteration_id,
+            )
+
+            print(f"RECURTION Level {level}, stream_coarse_step_C")
+
             self.stepper.launch_container(
-                streamId= 0,
-                op_name="stream_coarse",
-                mres_level=1,
+                streamId=0,
+                op_name="stream_coarse_step_C",
+                mres_level=level,
                 f_0=self.f_1,
                 f_1=self.f_0,
                 bc_mask=self.bc_mask,
@@ -158,4 +144,82 @@ def step(self):
                 timestep=iteration_id,
             )
 
+        self.iteration_idx += 1
+        iteration_id = self.iteration_idx % 2
+
+        recurtion(self.count_levels-1)
+
+        # # op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep
+        # ## LEVEL 1
+        # if  self.count_levels == 2:
+        #     self.stepper.launch_container(
+        #     streamId= 0,
+        #     op_name="collide_coarse",
+        #     mres_level=1,
+        #     f_0=self.f_0,
+        #     f_1=self.f_1,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        #     )
+        # ## LEVEL 0
+        # self.stepper.launch_container(
+        #     streamId= 0,
+        #     op_name="collide_coarse",
+        #     mres_level=0,
+        #     f_0=self.f_0,
+        #     f_1=self.f_1,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        # )
+        # self.stepper.launch_container(
+        #                     streamId= 0,
+        #     op_name="stream_coarse",
+        #     mres_level=0,
+        #     f_0=self.f_1,
+        #     f_1=self.f_0,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        # )
+        # self.stepper.launch_container(
+        #                     streamId= 0,
+        #     op_name="collide_coarse",
+        #     mres_level=0,
+        #     f_0=self.f_0,
+        #     f_1=self.f_1,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        # )
+        # self.stepper.launch_container(
+        #     streamId= 0,
+        #     op_name="stream_coarse",
+        #     mres_level=0,
+        #     f_0=self.f_1,
+        #     f_1=self.f_0,
+        #     bc_mask=self.bc_mask,
+        #     missing_mask=self.missing_mask,
+        #     omega=self.omega,
+        #     timestep=iteration_id,
+        # )
+        # # LEVEL 0
+        # if  self.count_levels == 2:
+        #     self.stepper.launch_container(
+        #         streamId= 0,
+        #         op_name="stream_coarse",
+        #         mres_level=1,
+        #         f_0=self.f_1,
+        #         f_1=self.f_0,
+        #         bc_mask=self.bc_mask,
+        #         missing_mask=self.missing_mask,
+        #         omega=self.omega,
+        #         timestep=iteration_id,
+        #     )
+
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index c43a122c..c4d728b4 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -266,18 +266,21 @@ def cl_collide_coarse(index: typing.Any):
                                 #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
                                 # else:
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_collision[l])
+                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
 
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
+                            wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
+
                     #wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_collide_coarse)
             return ll_collide_coarse
 
         @neon.Container.factory(name="stream_coarse")
-        def stream_coarse(
+        def stream_coarse_step_A(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
@@ -319,28 +322,125 @@ def cl_stream_coarse(index: typing.Any):
                             _f_post_collision = _f0_thread
                             _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
+                            for l in range(self.velocity_set.q):
+                                wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                    #wp.print("stream_coarse")
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="stream_coarse")
+        def stream_coarse_step_B(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().get_num_levels()
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            #od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+
+                @wp.func
+                def cl_stream_coarse(index: typing.Any):
+                    _missing_mask = _missing_mask_vec()
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(255):
+                        if not wp.neon_has_children(f_0_pn, index):
                             #do mres corrections
+                            is_valid = wp.bool(False)
                             for l in range(self.velocity_set.q):
+                                if l == 9:
+                                    continue
                                 pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
                                                                  wp.int8(-_c[1, l]),
                                                                  wp.int8(-_c[2, l]))
-                                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
-                                if wp.neon_has_children(f_0_pn, index, pull_direction):
-                                    is_valid = wp.bool(False)
-                                    read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
-                                    if is_valid:
-                                        _f_post_stream[l] = read_accumulate_date * self.compute_dtype(0.5)
+                                # if (!pin.hasChildren(cell, dir)) {
+                                if not wp.neon_has_children(f_0_pn, index, pull_direction):
+                                    # read value for now, but a is_valid_ngh would be fine
+                                    wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
+                                    # } else if (pin.hasParent(cell) && !(dir.x == 0 && dir.y == 0 && dir.z == 0)) {
+                                    if not is_valid:
+                                        if wp.neon_has_parent(f_0_pn, index):
+                                            is_valid = wp.bool(False)
+                                            uncle_val = wp.neon_uncle_read(f_0_pn, index, pull_direction, l, self.compute_dtype(0), is_valid)
+                                            if is_valid:
+                                                wp.neon_write(f_1_pn, index, l, uncle_val)
+
+                                # else:
+                                #     read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
+                                #     if is_valid:
+                                #         val = read_accumulate_date * self.compute_dtype(0.5)
+                                #         wp.neon_write(f_1_pn, index, l, val)
+
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="stream_coarse")
+        def stream_coarse_step_C(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+
+                @wp.func
+                def cl_stream_coarse(index: typing.Any):
+                    _missing_mask = _missing_mask_vec()
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(255):
+                        if not wp.neon_has_children(f_0_pn, index):
+                            # do stream normally
+                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                            _f_post_collision = _f0_thread
+                            _f_post_stream = _f1_thread
+
 
                             # do non mres post-streaming corrections
-                            _f_post_stream = apply_bc(index, timestep,
-                                                      _boundary_id, _missing_mask,
-                                                      f_0_pn, f_1_pn,
-                                                      _f_post_collision, _f_post_stream,
-                                                      True)
+                            _f_post_stream = apply_bc(
+                                index, timestep,
+                                _boundary_id,
+                                _missing_mask,
+                                f_0_pn, f_1_pn,
+                                _f_post_collision, _f_post_stream, True
+                            )
 
                             for l in range(self.velocity_set.q):
                                 wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    #wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -349,7 +449,9 @@ def cl_stream_coarse(index: typing.Any):
         return None, {
             #"single_step_finest": single_step_finest,
             "collide_coarse": collide_coarse,
-            "stream_coarse": stream_coarse}
+            "stream_coarse_step_A": stream_coarse_step_A,
+            "stream_coarse_step_B": stream_coarse_step_B,
+            "stream_coarse_step_C": stream_coarse_step_C}
 
     def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         containers = {'even': {}, 'odd': {}}

From 58b7f7bc920d3b14d74cbdfa76c62317e8068b31 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 26 Apr 2025 13:51:56 +0200
Subject: [PATCH 023/208] Refactoring.

---
 xlb/helper/nse_multires_solver.py             | 25 +++++
 .../mulltires_quadratic_equilibrium.py        |  2 +-
 .../macroscopic/multires_macroscopic.py       |  2 +-
 xlb/operator/stepper/nse_multires_stepper.py  | 93 ++++++++++++-------
 xlb/operator/stream/stream.py                 |  2 +-
 5 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index 9696babd..d821e7d1 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -100,6 +100,18 @@ def recurtion(level):
                 omega=self.omega,
                 timestep=iteration_id,
             )
+            # if(level == 0):
+            #     wp.synchronize()
+            #     self.f_0.update_host(0)
+            #     self.f_1.update_host(0)
+            #     wp.synchronize()
+            #     self.f_0.export_vti(f"pop_0_", "pop_0")
+            #     self.f_1.export_vti(f"pop_1_", "pop_1")
+            #     # exit
+            #     import sys
+            #     print("exit")
+            #     #sys.exit()
+            #     pass
 
             recurtion(level-1)
             recurtion(level-1)
@@ -143,6 +155,19 @@ def recurtion(level):
                 omega=self.omega,
                 timestep=iteration_id,
             )
+            # if(level == 1):
+            #     wp.synchronize()
+            #     self.f_0.update_host(0)
+            #     self.f_1.update_host(0)
+            #     wp.synchronize()
+            #     self.f_0.export_vti(f"pop_0_qq", "pop_0")
+            #     self.f_1.export_vti(f"pop_1_qq", "pop_1")
+            #     # exit
+            #     import sys
+            #     print("exit")
+            #     sys.exit()
+            #     pass
+
 
         self.iteration_idx += 1
         iteration_id = self.iteration_idx % 2
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index 1ce28674..f7ac3e79 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -81,7 +81,7 @@ def quadratic_equilibrium_cl(index: typing.Any):
                     _rho = wp.neon_read(rho_pn, index, 0)
                     feq = functional(_rho, _u)
 
-                    if wp.neon_has_children(f_pn, index):
+                    if wp.neon_has_child(f_pn, index):
                         for l in range(self.velocity_set.q):
                             feq[l] = self.compute_dtype(0.0)
                     # Set the output
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index 18d7a925..e57b8e21 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -107,7 +107,7 @@ def macroscopic_cl(gIdx: typing.Any):
                         for d in range(_d):
                             _u[d] = self.compute_dtype(0.0)
 
-                    if _boundary_id == wp.uint8(255) or wp.neon_has_children(f, gIdx):
+                    if _boundary_id == wp.uint8(255) or wp.neon_has_child(f, gIdx):
                         _rho = self.compute_dtype(0.0)
                         for d in range(_d):
                             _u[d] = self.compute_dtype(0.0)
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index c4d728b4..ff0140ce 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -233,6 +233,7 @@ def ll_collide_coarse(loader: neon.Loader):
                 f_1_pn =loader.get_mres_write_handle(f_1_fd)
 
                 _c = self.velocity_set.c
+                _w = self.velocity_set.w
 
                 @wp.func
                 def cl_collide_coarse(index: typing.Any):
@@ -245,7 +246,7 @@ def cl_collide_coarse(index: typing.Any):
                     if _boundary_id == wp.uint8(255):
                         return
 
-                    if not wp.neon_has_children(f_0_pn, index):
+                    if not wp.neon_has_child(f_0_pn, index):
 
                         # Read thread data for populations, these are post streaming
                         _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
@@ -256,7 +257,7 @@ def cl_collide_coarse(index: typing.Any):
                         _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
 
                         # Apply post-collision boundary conditions
-                        _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+                        #_f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
 
                         for l in range(self.velocity_set.q):
                             push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
@@ -265,8 +266,10 @@ def cl_collide_coarse(index: typing.Any):
                                 # if even_itertation == 0:
                                 #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
                                 # else:
-                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_collision[l])
-                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
+                                val = _f_post_collision[l]
+                                #val = self.compute_dtype(1.0)
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
+                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
 
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
@@ -313,7 +316,7 @@ def cl_stream_coarse(index: typing.Any):
                     _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(255):
-                        if not wp.neon_has_children(f_0_pn, index):
+                        if not wp.neon_has_child(f_0_pn, index):
                             # do stream normally
                             _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn,
                                                                                          f_1_pn,
@@ -358,38 +361,64 @@ def ll_stream_coarse(loader: neon.Loader):
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
                 _c = self.velocity_set.c
+                _w = self.velocity_set.w
 
                 @wp.func
                 def cl_stream_coarse(index: typing.Any):
-                    _missing_mask = _missing_mask_vec()
+                    # _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(255):
-                        if not wp.neon_has_children(f_0_pn, index):
-                            #do mres corrections
-                            is_valid = wp.bool(False)
-                            for l in range(self.velocity_set.q):
-                                if l == 9:
-                                    continue
-                                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
-                                                                 wp.int8(-_c[1, l]),
-                                                                 wp.int8(-_c[2, l]))
-                                # if (!pin.hasChildren(cell, dir)) {
-                                if not wp.neon_has_children(f_0_pn, index, pull_direction):
-                                    # read value for now, but a is_valid_ngh would be fine
-                                    wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
-                                    # } else if (pin.hasParent(cell) && !(dir.x == 0 && dir.y == 0 && dir.z == 0)) {
-                                    if not is_valid:
-                                        if wp.neon_has_parent(f_0_pn, index):
-                                            is_valid = wp.bool(False)
-                                            uncle_val = wp.neon_uncle_read(f_0_pn, index, pull_direction, l, self.compute_dtype(0), is_valid)
-                                            if is_valid:
-                                                wp.neon_write(f_1_pn, index, l, uncle_val)
+                        return
 
-                                # else:
-                                #     read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
-                                #     if is_valid:
-                                #         val = read_accumulate_date * self.compute_dtype(0.5)
-                                #         wp.neon_write(f_1_pn, index, l, val)
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+                    for l in range(self.velocity_set.q):
+                        if l == 9:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
+                                                         wp.int8(-_c[1, l]),
+                                                         wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(55555), has_ngh_at_same_level)
+
+                        # if (!pin.hasChildren(cell, dir)) {
+                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                            # NO finer ngh. in the pull direction (opposite of l)
+                            if not has_ngh_at_same_level:
+                                # NO ngh. at the same level
+                                # COULD we have a ngh. at the courser level?
+                                if wp.neon_has_parent(f_0_pn, index):
+                                    # YES ghost cell on top of us
+                                    has_a_courser_ngh = wp.bool(False)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh)
+                                    if has_a_courser_ngh:
+                                        # Full state:
+                                        # NO finer ngh. in the pull direction (opposite of l)
+                                        # NO ngh. at the same level
+                                        # YES ghost cell on top of us
+                                        # YES courser ngh.
+                                        # -> **Explosion**
+                                        wp.neon_write(f_1_pn, index, l, exploded_pop)
+                        else:
+                            # HERE -> I have a finer neigh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer neigh.
+                            if has_ngh_at_same_level:
+                                if l == 10:
+                                    wp.print(accumulated)
+                                # accumulated = _w[l]
+                                # Full State
+                                # YES finer ngh. in the pull direction (opposite of l)
+                                # YES ngh. at the same level
+                                # -> **Coalescence**
+                                accumulated = accumulated / self.compute_dtype(16)
+                                wp.neon_write(f_1_pn, index, l, accumulated)
+                            else:
+                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
 
                 loader.declare_kernel(cl_stream_coarse)
@@ -423,7 +452,7 @@ def cl_stream_coarse(index: typing.Any):
                     _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(255):
-                        if not wp.neon_has_children(f_0_pn, index):
+                        if not wp.neon_has_child(f_0_pn, index):
                             # do stream normally
                             _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
                             _f_post_collision = _f0_thread
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index 0cac3c6a..2c6a0bc7 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -137,7 +137,7 @@ def functional(
                                       wp.int8(-_c[2, l]))
 
                 unused_is_valid = wp.bool(False)
-                _f[l] = wp.neon_ngh_data(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)
+                _f[l] = wp.neon_read_ngh(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)
 
             return _f
 

From 07fc7ae49036b7abb0ca941f7896d92693a7fe15 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 30 Apr 2025 22:18:14 +0200
Subject: [PATCH 024/208] WIP

---
 .../performance/mlups_3d_multires_solver.py   |   2 +-
 xlb/operator/stepper/nse_multires_stepper.py  | 102 +++++++++++-------
 .../stepper/nse_multires_stepper_vk.py        |  14 +--
 xlb/velocity_set/d3q19.py                     |   2 +
 4 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index bbf66af4..601e9491 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -94,7 +94,7 @@ def peel(dim, idx, peel_level, outwards):
             for k in range(dim.z):
                 idx = neon.Index_3d(i,j,k)
                 val = 0
-                if peel(dim, idx, 20, True):
+                if peel(dim, idx, 4, True):
                     val = 1
                 level_zero_mask[i, j, k] = val
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index ff0140ce..a4301a81 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -260,13 +260,16 @@ def cl_collide_coarse(index: typing.Any):
                         #_f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
 
                         for l in range(self.velocity_set.q):
-                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
+                                                             wp.int8(_c[1, l]),
+                                                             wp.int8(_c[2, l]))
                             if(level < num_levels - 1):
                                 ## Store
                                 # if even_itertation == 0:
                                 #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
                                 # else:
                                 val = _f_post_collision[l]
+                                val = self.compute_dtype(11)
                                 #val = self.compute_dtype(1.0)
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
                                 wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
@@ -313,20 +316,26 @@ def ll_stream_coarse(loader: neon.Loader):
 
                 @wp.func
                 def cl_stream_coarse(index: typing.Any):
-                    _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id != wp.uint8(255):
-                        if not wp.neon_has_child(f_0_pn, index):
-                            # do stream normally
-                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn,
-                                                                                         f_1_pn,
-                                                                                         missing_mask_pn,
-                                                                                         index)
-                            _f_post_collision = _f0_thread
-                            _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-
-                            for l in range(self.velocity_set.q):
-                                wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn,
+                                                                                 f_1_pn,
+                                                                                 missing_mask_pn,
+                                                                                 index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
                     #wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
@@ -365,15 +374,16 @@ def ll_stream_coarse(loader: neon.Loader):
 
                 @wp.func
                 def cl_stream_coarse(index: typing.Any):
-                    # _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id != wp.uint8(255):
+                    if _boundary_id == wp.uint8(255):
                         return
 
                     are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
                     if are_we_a_halo_cell:
                         # HERE: we are a halo cell so we just exit
                         return
+
+
                     for l in range(self.velocity_set.q):
                         if l == 9:
                             # HERE, we skip the center direction
@@ -384,7 +394,7 @@ def cl_stream_coarse(index: typing.Any):
                                                          wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(55555), has_ngh_at_same_level)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
@@ -393,7 +403,7 @@ def cl_stream_coarse(index: typing.Any):
                                 # NO ngh. at the same level
                                 # COULD we have a ngh. at the courser level?
                                 if wp.neon_has_parent(f_0_pn, index):
-                                    # YES ghost cell on top of us
+                                    # YES halo cell on top of us
                                     has_a_courser_ngh = wp.bool(False)
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh)
                                     if has_a_courser_ngh:
@@ -405,18 +415,23 @@ def cl_stream_coarse(index: typing.Any):
                                         # -> **Explosion**
                                         wp.neon_write(f_1_pn, index, l, exploded_pop)
                         else:
-                            # HERE -> I have a finer neigh. in direction pull (opposite l)
-                            # Then I have to read from the halo on top of my finer neigh.
+                            # HERE -> I have a finer ngh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
                                 if l == 10:
                                     wp.print(accumulated)
-                                # accumulated = _w[l]
+                                    glob = wp.neon_global_idx(f_1_pn, index)
+                                    wp.neon_cuda_info()
+                                    wp.neon_print(glob)
+                                    wp.neon_level(f_1_pn)
+                                accumulated = _w[l]
                                 # Full State
                                 # YES finer ngh. in the pull direction (opposite of l)
                                 # YES ngh. at the same level
                                 # -> **Coalescence**
-                                accumulated = accumulated / self.compute_dtype(16)
+                                # accumulated = accumulated / self.compute_dtype(16)
                                 wp.neon_write(f_1_pn, index, l, accumulated)
+
                             else:
                                 wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
@@ -449,27 +464,34 @@ def ll_stream_coarse(loader: neon.Loader):
 
                 @wp.func
                 def cl_stream_coarse(index: typing.Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
                     _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id != wp.uint8(255):
-                        if not wp.neon_has_child(f_0_pn, index):
-                            # do stream normally
-                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
-                            _f_post_collision = _f0_thread
-                            _f_post_stream = _f1_thread
-
-
-                            # do non mres post-streaming corrections
-                            _f_post_stream = apply_bc(
-                                index, timestep,
-                                _boundary_id,
-                                _missing_mask,
-                                f_0_pn, f_1_pn,
-                                _f_post_collision, _f_post_stream, True
-                            )
+                    # do stream normally
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = _f1_thread
 
-                            for l in range(self.velocity_set.q):
-                                wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+
+                    # do non mres post-streaming corrections
+                    _f_post_stream = apply_bc(
+                        index, timestep,
+                        _boundary_id,
+                        _missing_mask,
+                        f_0_pn, f_1_pn,
+                        _f_post_collision, _f_post_stream, True
+                    )
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
 
                 loader.declare_kernel(cl_stream_coarse)
 
diff --git a/xlb/operator/stepper/nse_multires_stepper_vk.py b/xlb/operator/stepper/nse_multires_stepper_vk.py
index d8c8318a..64fa668e 100644
--- a/xlb/operator/stepper/nse_multires_stepper_vk.py
+++ b/xlb/operator/stepper/nse_multires_stepper_vk.py
@@ -282,9 +282,9 @@ def cl_single_step_finest(index: typing.Any):
                                 is_valid = False
                                 value = self.compute_dtype(0)
                                 if od_or_even == 0:
-                                    value = wp.neon_uncle_read(f_1_pn, index, push_direction, opposite_l, value, is_valid)
+                                    value = wp.neon_read_uncle(f_1_pn, index, push_direction, opposite_l, value, is_valid)
                                 else:
-                                    value = wp.neon_uncle_read(f_0_pn, index, push_direction, opposite_l, value, is_valid)
+                                    value = wp.neon_read_uncle(f_0_pn, index, push_direction, opposite_l, value, is_valid)
                                 if is_valid:
                                     wp.neon_write(f_1_pn, index, l, _f_post_stream[l], value)
 
@@ -327,7 +327,7 @@ def cl_collide_coarse(index: typing.Any):
                     if _boundary_id == wp.uint8(255):
                         return
 
-                    if not wp.neon_has_children(f_0_pn, index):
+                    if not wp.neon_has_child(f_0_pn, index):
 
                         # Read thread data for populations, these are post streaming
                         _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
@@ -393,7 +393,7 @@ def cl_stream_coarse(index: typing.Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(255):
                         #  if (!pin.hasChildren(cell)) {
-                        if not wp.neon_has_children(f_0_pn, index):
+                        if not wp.neon_has_child(f_0_pn, index):
                             # do stream normally
                             _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
                             _f_post_stream = self.stream.neon_functional(f_0_pn, index)
@@ -404,21 +404,21 @@ def cl_stream_coarse(index: typing.Any):
                                 _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
 
                                 #  if (!pin.hasChildren(cell, dir)) {
-                                if not wp.neon_has_children(f_0_pn, index, pull_direction):
+                                if not wp.neon_has_child(f_0_pn, index, pull_direction):
                                     #if (nghType.mIsValid) {
                                     # NOTHING as taken  care after
                                     # } else if (pin.hasParent(cell) && !(dir.x == 0 && dir.y == 0 && dir.z == 0)) {
                                     if wp.neon_has_parent(f_0_pn, index):
                                         if pull_direction.x != 0 or pull_direction.y != 0 or pull_direction.z != 0:
                                             # is_valid = wp.bool(False)
-                                            # uncle_val = wp.neon_uncle_read(f_0_pn, index, pull_direction, l, self.compute_dtype(0), is_valid)
+                                            # uncle_val = wp.neon_read_uncle(f_0_pn, index, pull_direction, l, self.compute_dtype(0), is_valid)
                                             # if is_valid:
                                             #     #_f_post_stream[l] = uncle_val
                                             #     # HERE DB
                                             _f_post_stream[l] =  self.compute_dtype(0.0)
                                 else:
                                     is_valid = wp.bool(False)
-                                    read_accumulate_date = wp.neon_ngh_data(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
+                                    read_accumulate_date = wp.neon_read_ngh(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
                                     if is_valid:
                                         #_f_post_stream[l] = read_accumulate_date * self.compute_dtype(0.5)
                                         # HERE DB
diff --git a/xlb/velocity_set/d3q19.py b/xlb/velocity_set/d3q19.py
index 4a48c2f0..f30af6f8 100644
--- a/xlb/velocity_set/d3q19.py
+++ b/xlb/velocity_set/d3q19.py
@@ -17,8 +17,10 @@ class D3Q19(VelocitySet):
     def __init__(self, precision_policy, backend):
         # Construct the velocity vectors and weights
         c = np.array([ci for ci in itertools.product([-1, 0, 1], repeat=3) if np.sum(np.abs(ci)) <= 2]).T
+
         w = np.zeros(19)
         for i in range(19):
+            print(f"{i} -> c[:, i] = {c[:, i]}")
             if np.sum(np.abs(c[:, i])) == 0:
                 w[i] = 1.0 / 3.0
             elif np.sum(np.abs(c[:, i])) == 1:

From e0f714610d8d2bd2b377ac434240972b757646dd Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 5 May 2025 09:28:51 +0200
Subject: [PATCH 025/208] Debugging

---
 xlb/helper/nse_multires_solver.py | 73 +------------------------------
 1 file changed, 1 insertion(+), 72 deletions(-)

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index d821e7d1..f6a7feb7 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -174,77 +174,6 @@ def recurtion(level):
 
         recurtion(self.count_levels-1)
 
-        # # op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep
-        # ## LEVEL 1
-        # if  self.count_levels == 2:
-        #     self.stepper.launch_container(
-        #     streamId= 0,
-        #     op_name="collide_coarse",
-        #     mres_level=1,
-        #     f_0=self.f_0,
-        #     f_1=self.f_1,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        #     )
-        # ## LEVEL 0
-        # self.stepper.launch_container(
-        #     streamId= 0,
-        #     op_name="collide_coarse",
-        #     mres_level=0,
-        #     f_0=self.f_0,
-        #     f_1=self.f_1,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        # )
-        # self.stepper.launch_container(
-        #                     streamId= 0,
-        #     op_name="stream_coarse",
-        #     mres_level=0,
-        #     f_0=self.f_1,
-        #     f_1=self.f_0,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        # )
-        # self.stepper.launch_container(
-        #                     streamId= 0,
-        #     op_name="collide_coarse",
-        #     mres_level=0,
-        #     f_0=self.f_0,
-        #     f_1=self.f_1,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        # )
-        # self.stepper.launch_container(
-        #     streamId= 0,
-        #     op_name="stream_coarse",
-        #     mres_level=0,
-        #     f_0=self.f_1,
-        #     f_1=self.f_0,
-        #     bc_mask=self.bc_mask,
-        #     missing_mask=self.missing_mask,
-        #     omega=self.omega,
-        #     timestep=iteration_id,
-        # )
-        # # LEVEL 0
-        # if  self.count_levels == 2:
-        #     self.stepper.launch_container(
-        #         streamId= 0,
-        #         op_name="stream_coarse",
-        #         mres_level=1,
-        #         f_0=self.f_1,
-        #         f_1=self.f_0,
-        #         bc_mask=self.bc_mask,
-        #         missing_mask=self.missing_mask,
-        #         omega=self.omega,
-        #         timestep=iteration_id,
-        #     )
+
 
 

From 556d32e7459906744f6bd3d90a4cfbcdb2e7ac8b Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 5 May 2025 15:50:41 +0200
Subject: [PATCH 026/208] LDC

---
 .../performance/mlups_3d_multires_solver.py   |  12 +-
 xlb/helper/nse_multires_solver.py             |  26 ++-
 xlb/operator/stepper/nse_multires_stepper.py  | 175 +++++++++++++-----
 3 files changed, 149 insertions(+), 64 deletions(-)

diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index 601e9491..1dee395a 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -94,7 +94,7 @@ def peel(dim, idx, peel_level, outwards):
             for k in range(dim.z):
                 idx = neon.Index_3d(i,j,k)
                 val = 0
-                if peel(dim, idx, 4, True):
+                if peel(dim, idx, dim.x/9, True):
                     val = 1
                 level_zero_mask[i, j, k] = val
 
@@ -105,9 +105,7 @@ def peel(dim, idx, peel_level, outwards):
         for j in range(m.x):
             for k in range(m.x):
                 idx = neon.Index_3d(i,j,k)
-                val = 0
-                if peel(dim, idx, dim.x, True) and peel(dim, idx, 3, False):
-                    val = 1
+                val = 1
                 level_one_mask[i, j, k] = val
 
     level_one_mask = np.ascontiguousarray(level_one_mask, dtype=np.int32)
@@ -131,11 +129,13 @@ def peel(dim, idx, peel_level, outwards):
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    Re = 100.0
+    Re = 1000.0
 
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
     omega = 1.0 / (3.0 * visc + 0.5)
+    omega = 1.0
+
 
     # # Initialize fields and run simulation
     # omega = 1.0
@@ -150,7 +150,7 @@ def peel(dim, idx, peel_level, outwards):
     for i in range(num_steps):
         print(f"step {i}")
         sim.step()
-        if i%1 == 0:
+        if i%10 == 0:
             sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index f6a7feb7..f8c66f38 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -18,21 +18,29 @@ def __init__(self, grid, velocity_set, stepper, omega):
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
+        self.coalescence_factor = grid.create_field(cardinality=velocity_set.q, dtype=self.precision_policy.store_precision)
+
         fname_prefix='test'
 
         for level in range(self.count_levels):
             self.u.fill_run(level, 0.0, 0)
             self.rho.fill_run(level, 1.0, 0)
-        wp.synchronize()
-        self.u.update_host(0)
-        wp.synchronize()
-        self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
+            self.coalescence_factor.fill_run(level, 0.0, 0)
+
+
+
+        #wp.synchronize()
+        #self.u.update_host(0)
+        #wp.synchronize()
+        #self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
 
         self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields(rho=self.rho,u=self.u)
-        wp.synchronize()
-        self.u.update_host(0)
-        wp.synchronize()
-        self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
+        stepper.prepare_coalescence_count(coalescence_factor=self.coalescence_factor, bc_mask=self.bc_mask)
+
+        #wp.synchronize()
+        #self.u.update_host(0)
+        #wp.synchronize()
+        #self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
 
         self.odd_step = None
         self.even_step = None
@@ -138,7 +146,7 @@ def recurtion(level):
                 f_1=self.f_0,
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
-                omega=self.omega,
+                omega = self.coalescence_factor,
                 timestep=iteration_id,
             )
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index a4301a81..52e5f7ed 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -81,7 +81,7 @@ def prepare_fields(self, rho, u, initializer=None):
             f_1.copy_from_run(level, f_0, 0)
         f_0.update_host(0)
         wp.synchronize()
-        f_0.export_vti("f0_eq_init.vti", "init_f0")
+        #f_0.export_vti("f0_eq_init.vti", "init_f0")
 
         # Process boundary conditions and update masks
         bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
@@ -92,12 +92,122 @@ def prepare_fields(self, rho, u, initializer=None):
         f_0.update_host(0)
         wp.synchronize()
         bc_mask.export_vti("bc_mask.vti", 'bc_mask')
-        f_0.export_vti("init_f0.vti", 'init_f0')
-
+        #f_0.export_vti("init_f0.vti", 'init_f0')
         #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         return f_0, f_1, bc_mask, missing_mask
 
+    def prepare_coalescence_count(self, coalescence_factor, bc_mask):
+        num_levels = coalescence_factor.get_grid().get_num_levels()
+
+        @neon.Container.factory(name="sum_kernel_by_level")
+        def sum_kernel_by_level(level):
+            def ll_coalescence_count(loader: neon.Loader):
+                loader.set_mres_grid(coalescence_factor.get_grid(), level)
+
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+                import typing
+                @wp.func
+                def cl_collide_coarse(index: typing.Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+                    if not wp.neon_has_child(coalescence_factor_pn, index):
+                        for l in range(self.velocity_set.q):
+                            if level < num_levels - 1:
+                                push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
+                                                                 wp.int8(_c[1, l]),
+                                                                 wp.int8(_c[2, l]))
+                                val = self.compute_dtype(1)
+                                wp.neon_mres_lbm_store_op(coalescence_factor_pn, index, l, push_direction, val)
+                loader.declare_kernel(cl_collide_coarse)
+            return ll_coalescence_count
+
+        for level in range(num_levels):
+            sum_kernel = sum_kernel_by_level(level)
+            sum_kernel.run(0)
+
+        @neon.Container.factory(name="sum_kernel_by_level")
+        def invert_count(level):
+            def loading(loader: neon.Loader):
+                loader.set_mres_grid(coalescence_factor.get_grid(), level)
+
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+                import typing
+                @wp.func
+                def compute(index: typing.Any):
+                    # _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    # if _boundary_id == wp.uint8(255):
+                    #     return
+                    # for l in range(self.velocity_set.q):
+                    #     val = wp.neon_read(coalescence_factor_pn, index, l)
+                    #     if val > 0:
+                    #         val = self.compute_dtype(1) / val
+                    #     wp.neon_write(coalescence_factor_pn, index, l, val)
+                    #####
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(coalescence_factor_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+
+                    for l in range(self.velocity_set.q):
+                        if l == 9:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
+                                                         wp.int8(-_c[1, l]),
+                                                         wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        coalescence_factor = wp.neon_read_ngh(coalescence_factor_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                        # if (!pin.hasChildren(cell, dir)) {
+                        if not wp.neon_has_finer_ngh(coalescence_factor_pn, index, pull_direction):
+                            pass
+                        else:
+                            # HERE -> I have a finer ngh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer ngh.
+                            if has_ngh_at_same_level:
+                                # if l == 10:
+                                #     wp.print(accumulated)
+                                #     glob = wp.neon_global_idx(f_1_pn, index)
+                                #     wp.neon_cuda_info()
+                                #     wp.neon_print(glob)
+                                #     wp.neon_level(f_1_pn)
+                                # accumulated = _w[l]
+                                # Full State
+                                # YES finer ngh. in the pull direction (opposite of l)
+                                # YES ngh. at the same level
+                                # -> **Coalescence**
+                                if coalescence_factor > self.compute_dtype(0):
+                                    coalescence_factor = self.compute_dtype(1)/( self.compute_dtype(2)*coalescence_factor)
+                                    wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
+
+                            else:
+                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+
+                loader.declare_kernel(compute)
+            return loading
+
+        for level in range(num_levels):
+            sum_kernel = invert_count(level)
+            sum_kernel.run(0)
+        return
+
     @classmethod
     def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
         """Process boundary conditions and update boundary masks."""
@@ -264,13 +374,7 @@ def cl_collide_coarse(index: typing.Any):
                                                              wp.int8(_c[1, l]),
                                                              wp.int8(_c[2, l]))
                             if(level < num_levels - 1):
-                                ## Store
-                                # if even_itertation == 0:
-                                #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
-                                # else:
                                 val = _f_post_collision[l]
-                                val = self.compute_dtype(11)
-                                #val = self.compute_dtype(1.0)
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
                                 wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
 
@@ -280,12 +384,10 @@ def cl_collide_coarse(index: typing.Any):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
                             wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
 
-                    #wp.print("stream_coarse")
-
                 loader.declare_kernel(cl_collide_coarse)
             return ll_collide_coarse
 
-        @neon.Container.factory(name="stream_coarse")
+        @neon.Container.factory(name="stream_coarse_step_A")
         def stream_coarse_step_A(
             level: int,
             f_0_fd: Any,
@@ -342,7 +444,7 @@ def cl_stream_coarse(index: typing.Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse")
+        @neon.Container.factory(name="stream_coarse_step_B")
         def stream_coarse_step_B(
             level: int,
             f_0_fd: Any,
@@ -352,22 +454,15 @@ def stream_coarse_step_B(
             omega: Any,
             timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            #od_or_even = wp.module("odd_or_even", "even")
-
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
+                coalescence_factor_fd = omega
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+                coalescence_factor_pn  = loader.get_mres_read_handle(coalescence_factor_fd)
 
                 _c = self.velocity_set.c
                 _w = self.velocity_set.w
@@ -418,18 +513,19 @@ def cl_stream_coarse(index: typing.Any):
                             # HERE -> I have a finer ngh. in direction pull (opposite l)
                             # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                if l == 10:
-                                    wp.print(accumulated)
-                                    glob = wp.neon_global_idx(f_1_pn, index)
-                                    wp.neon_cuda_info()
-                                    wp.neon_print(glob)
-                                    wp.neon_level(f_1_pn)
-                                accumulated = _w[l]
+                                # if l == 10:
+                                #     wp.print(accumulated)
+                                #     glob = wp.neon_global_idx(f_1_pn, index)
+                                #     wp.neon_cuda_info()
+                                #     wp.neon_print(glob)
+                                #     wp.neon_level(f_1_pn)
+                                # accumulated = _w[l]
                                 # Full State
                                 # YES finer ngh. in the pull direction (opposite of l)
                                 # YES ngh. at the same level
                                 # -> **Coalescence**
-                                # accumulated = accumulated / self.compute_dtype(16)
+                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
+                                accumulated = accumulated *coalescence_factor
                                 wp.neon_write(f_1_pn, index, l, accumulated)
 
                             else:
@@ -521,25 +617,6 @@ def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, mis
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        #if self.c is None:
-        #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
-        # c = None
-        # if self.odd_or_even == 'even':
-        #     c = self.c_even
-        # else:
-        #     c = self.c_odd
-        #
-        # if c is None:
-        #     pass
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
         c.run(0)
-        #
-        # if self.odd_or_even == 'even':
-        #     c = self.c_even
-        # else:
-        #     c = self.c_odd
-        #
-        # if self.odd_or_even == 'even':
-        #     self.odd_or_even = 'odd'
-
         return f_0, f_1

From 7cbc16fb481092e0d0a9b09061505fa56276bb62 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 5 May 2025 20:47:02 +0200
Subject: [PATCH 027/208] WIP

---
 .../3_levels_mlups_3d_multires_solver.py      | 219 ++++++++++++++++++
 1 file changed, 219 insertions(+)
 create mode 100644 examples/performance/3_levels_mlups_3d_multires_solver.py

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
new file mode 100644
index 00000000..880b52bf
--- /dev/null
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -0,0 +1,219 @@
+import xlb
+import argparse
+import time
+import warp as wp
+import numpy as np
+
+# add a directory to the PYTHON PATH
+import sys
+# sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
+import neon
+
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
+from xlb.distribute import distribute
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
+    # Positional arguments
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
+    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
+    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
+    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+
+    # Optional arguments
+    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
+    parser.add_argument("--velocity_set", type=str, default='D3Q19',
+                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
+                        )
+
+    return parser.parse_args()
+
+
+def setup_simulation(args):
+    backend = None
+    if args.backend == "jax": backend = ComputeBackend.JAX
+    elif args.backend == "warp": backend = ComputeBackend.WARP
+    elif args.backend == "neon": backend = ComputeBackend.NEON
+    if backend is None:
+        raise ValueError("Invalid backend")
+
+    precision_policy_map = {
+        "fp32/fp32": PrecisionPolicy.FP32FP32,
+        "fp64/fp64": PrecisionPolicy.FP64FP64,
+        "fp64/fp32": PrecisionPolicy.FP64FP32,
+        "fp32/fp16": PrecisionPolicy.FP32FP16,
+    }
+    precision_policy = precision_policy_map.get(args.precision)
+    if precision_policy is None:
+        raise ValueError("Invalid precision")
+
+    velocity_set = None
+    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if velocity_set is None:
+        raise ValueError("Invalid velocity set")
+
+    xlb.init(
+        velocity_set=velocity_set,
+        default_backend=backend,
+        default_precision_policy=precision_policy,
+    )
+
+    return backend, precision_policy
+
+
+def run(backend, precision_policy, grid_shape, num_steps):
+    # Create grid and setup boundary conditions
+    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+
+    def peel(dim, idx, peel_level, outwards):
+        if outwards:
+            xIn =  idx.x <= peel_level or idx.x >= dim.x -1 -peel_level
+            yIn =  idx.y <= peel_level or idx.y >= dim.y -1 -peel_level
+            zIn =  idx.z <= peel_level or idx.z >= dim.z -1 - peel_level
+            return xIn or yIn or zIn
+        else:
+            xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
+            yIn = idx.y >= peel_level and idx.y <= dim.y - 1 - peel_level
+            zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
+            return xIn and yIn and zIn
+
+
+    dim = neon.Index_3d(grid_shape[0],
+                        grid_shape[1],
+                        grid_shape[2])
+
+    def get_peeled_np(level, width):
+        divider = 2**level
+        m = neon.Index_3d(dim.x // divider , dim.y // divider, dim.z // divider)
+        if level == 0:
+            m = dim
+
+        mask = np.zeros((m.x, m.y, m.z), dtype=int)
+        mask = np.ascontiguousarray(mask, dtype=np.int32)
+        # loop over all the elements in mask and set to one any cell within m.x / width cells of a domain face
+        for i in range(m.x):
+            for j in range(m.y):
+                for k in range(m.z):
+                    idx = neon.Index_3d(i, j, k)
+                    val = 0
+                    if peel(m, idx, m.x / width, True):
+                        val = 1
+                    mask[i, j, k] = val
+        return mask
+
+    levels = []
+
+    l0 = get_peeled_np(0, 17)
+    l1 = get_peeled_np(1, 7)
+    l2 = get_peeled_np(2, 4)
+
+    lastLevel = 3
+    divider = 2**lastLevel
+    m = neon.Index_3d(dim.x // divider +1, dim.y // divider+1, dim.z // divider+1)
+    lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
+    lastLevel = np.ascontiguousarray(lastLevel, dtype=np.int32)
+
+    levels = [l0, l1, l2, lastLevel]
+
+    grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
+                                 sparsity_pattern_list=levels,
+                                 sparsity_pattern_origins=[ neon.Index_3d(0, 0, 0)]*len(levels),)
+
+    box = grid.bounding_box_indices()
+    box_no_edge = grid.bounding_box_indices(remove_edges=True)
+    lid = box_no_edge["top"]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+
+    prescribed_vel = 0.05
+
+    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
+                           EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls)]
+
+    # Create stepper
+    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
+
+    Re = 1000.0
+
+    clength = grid_shape[0] - 1
+    visc = prescribed_vel * clength / Re
+    omega = 1.0 / (3.0 * visc + 0.5)
+    omega = 1.0
+
+
+    # # Initialize fields and run simulation
+    # omega = 1.0
+
+    sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
+
+    sim.export_macroscopic("Initial_")
+
+    print("start timing")
+    start_time = time.time()
+
+    for i in range(num_steps):
+        print(f"step {i}")
+        sim.step()
+        if i%10 == 0:
+            sim.export_macroscopic("u_lid_driven_cavity_")
+    wp.synchronize()
+    t = time.time() - start_time
+
+    sim.export_macroscopic("u_lid_driven_cavity_")
+    return t
+
+
+def calculate_mlups(cube_edge, num_steps, elapsed_time):
+    total_lattice_updates = cube_edge**3 * num_steps
+    mlups = (total_lattice_updates / elapsed_time) / 1e6
+    return mlups
+
+def post_process(macro, rho, u, f_0,  i):
+    # Write the results. We'll use JAX backend for the post-processing
+    # import jax.numpy as jnp
+    # if not isinstance(f_0, jnp.ndarray):
+    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
+    #     f_0 = wp.to_jax(f_0)[..., 0]
+    # else:
+    #     f_0 = f_0
+    rho, u = macro(f_0, rho, u )
+    wp.synchronize()
+    u.update_host(0)
+    rho.update_host(0)
+    wp.synchronize()
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+
+    pass
+
+    # # remove boundary cells
+    # rho = rho[:, 1:-1, 1:-1, 1:-1]
+    # u = u[:, 1:-1, 1:-1, 1:-1]
+    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
+    #
+    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
+    #
+    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
+    # ny=fields["u_magnitude"].shape[1]
+    # from xlb.utils import  save_image
+    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
+
+def main():
+
+    args = parse_arguments()
+    backend, precision_policy = setup_simulation(args)
+    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
+    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
+    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
+
+    print(f"Simulation completed in {elapsed_time:.2f} seconds")
+    print(f"MLUPs: {mlups:.2f}")
+
+
+if __name__ == "__main__":
+    main()

From 05a86e3fca61fce0ce9487eb5fba54fe33d98436 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 01:56:37 +0200
Subject: [PATCH 028/208] Improving GPU utilization.

---
 .../3_levels_mlups_3d_multires_solver.py      |  35 +---
 xlb/grid/multires_grid.py                     |   3 +
 xlb/helper/nse_multires_solver.py             |  70 +++----
 xlb/operator/stepper/nse_multires_stepper.py  | 181 ++++++++++--------
 .../stepper/nse_multires_stepper_vk.py        |   5 +-
 5 files changed, 156 insertions(+), 138 deletions(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index 880b52bf..c7cd9ba3 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -130,7 +130,7 @@ def get_peeled_np(level, width):
     walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
 
-    prescribed_vel = 0.05
+    prescribed_vel = 0.1
 
     boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
                            EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls)]
@@ -138,12 +138,12 @@ def get_peeled_np(level, width):
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    Re = 1000.0
+    Re = 5000.0
 
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
     omega = 1.0 / (3.0 * visc + 0.5)
-    omega = 1.0
+    #omega = 1.0
 
 
     # # Initialize fields and run simulation
@@ -152,16 +152,17 @@ def get_peeled_np(level, width):
     sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
 
     sim.export_macroscopic("Initial_")
+    sim.step()
 
     print("start timing")
+    wp.synchronize()
     start_time = time.time()
 
     for i in range(num_steps):
-        print(f"step {i}")
         sim.step()
-        if i%10 == 0:
-            sim.export_macroscopic("u_lid_driven_cavity_")
-    wp.synchronize()
+        if i%100 == 0:
+            print(f"step {i}")
+        #    sim.export_macroscopic("u_lid_driven_cavity_")
     t = time.time() - start_time
 
     sim.export_macroscopic("u_lid_driven_cavity_")
@@ -169,27 +170,11 @@ def get_peeled_np(level, width):
 
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    total_lattice_updates = cube_edge**3 * num_steps
+    num_step_finer = num_steps * 2**(4-1)
+    total_lattice_updates = cube_edge**3 * num_step_finer
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
-def post_process(macro, rho, u, f_0,  i):
-    # Write the results. We'll use JAX backend for the post-processing
-    # import jax.numpy as jnp
-    # if not isinstance(f_0, jnp.ndarray):
-    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
-    #     f_0 = wp.to_jax(f_0)[..., 0]
-    # else:
-    #     f_0 = f_0
-    rho, u = macro(f_0, rho, u )
-    wp.synchronize()
-    u.update_host(0)
-    rho.update_host(0)
-    wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
-
-    pass
 
     # # remove boundary cells
     # rho = rho[:, 1:-1, 1:-1, 1:-1]
diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index 5803e027..25c408de 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -92,6 +92,9 @@ def create_field(
                 field.fill_run(level= l, value=fill_value,stream_idx = 0)
         return field
 
+    def get_neon_backend(self):
+        return self.bk
+
     def _create_warp_field(self,
                            cardinality: int,
                            dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index f8c66f38..e377572e 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -7,6 +7,7 @@
 import neon
 import warp as wp
 
+
 class Nse_multires_simulation:
     def __init__(self, grid, velocity_set, stepper, omega):
         self.stepper = stepper
@@ -18,29 +19,28 @@ def __init__(self, grid, velocity_set, stepper, omega):
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
-        self.coalescence_factor = grid.create_field(cardinality=velocity_set.q, dtype=self.precision_policy.store_precision)
+        self.coalescence_factor = grid.create_field(cardinality=velocity_set.q,
+                                                    dtype=self.precision_policy.store_precision)
 
-        fname_prefix='test'
+        fname_prefix = 'test'
 
         for level in range(self.count_levels):
             self.u.fill_run(level, 0.0, 0)
             self.rho.fill_run(level, 1.0, 0)
             self.coalescence_factor.fill_run(level, 0.0, 0)
 
+        # wp.synchronize()
+        # self.u.update_host(0)
+        # wp.synchronize()
+        # self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
 
-
-        #wp.synchronize()
-        #self.u.update_host(0)
-        #wp.synchronize()
-        #self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
-
-        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields(rho=self.rho,u=self.u)
+        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields(rho=self.rho, u=self.u)
         stepper.prepare_coalescence_count(coalescence_factor=self.coalescence_factor, bc_mask=self.bc_mask)
 
-        #wp.synchronize()
-        #self.u.update_host(0)
-        #wp.synchronize()
-        #self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
+        # wp.synchronize()
+        # self.u.update_host(0)
+        # wp.synchronize()
+        # self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
 
         self.odd_step = None
         self.even_step = None
@@ -54,6 +54,7 @@ def __init__(self, grid, velocity_set, stepper, omega):
         )
 
         self.__init_containers(self.count_levels)
+        self._step_init()
 
     def __init_containers(self, num_levels):
         # working only with level 0 for now
@@ -77,7 +78,7 @@ def __init_containers(self, num_levels):
 
     def export_macroscopic(self, fname_prefix):
         print(f"exporting macroscopic: #levels {self.grid.count_levels}")
-        self.macro.launch_container(streamId = 0, f_0 = self.f_0, bc_mask = self.bc_mask, rho = self.rho, u = self.u)
+        self.macro.launch_container(streamId=0, f_0=self.f_0, bc_mask=self.bc_mask, rho=self.rho, u=self.u)
 
         import warp as wp
         wp.synchronize()
@@ -88,17 +89,22 @@ def export_macroscopic(self, fname_prefix):
 
         return
 
-    # one step at the corase level
     def step(self):
+        self.iteration_idx = self.iteration_idx + 1
+        self.sk.run()
+
+    # one step at the coarse level
+    def _step_init(self):
+        self.app = []
 
-        def recurtion(level):
+        def recurtion(level, app):
             if level < 0:
                 return
             print(f"RECURTION down to level {level}")
             print(f"RECURTION Level {level}, COLLIDE")
 
-            self.stepper.launch_container(
-                streamId=0,
+            self.stepper.add_to_app(
+                app=app,
                 op_name="collide_coarse",
                 mres_level=level,
                 f_0=self.f_0,
@@ -121,12 +127,12 @@ def recurtion(level):
             #     #sys.exit()
             #     pass
 
-            recurtion(level-1)
-            recurtion(level-1)
+            recurtion(level - 1, app)
+            recurtion(level - 1, app)
 
             print(f"RECURTION Level {level}, stream_coarse_step_A")
-            self.stepper.launch_container(
-                streamId=0,
+            self.stepper.add_to_app(
+                app=app,
                 op_name="stream_coarse_step_A",
                 mres_level=level,
                 f_0=self.f_1,
@@ -138,22 +144,22 @@ def recurtion(level):
             )
             print(f"RECURTION Level {level}, stream_coarse_step_B")
 
-            self.stepper.launch_container(
-                streamId=0,
+            self.stepper.add_to_app(
+                app=app,
                 op_name="stream_coarse_step_B",
                 mres_level=level,
                 f_0=self.f_1,
                 f_1=self.f_0,
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
-                omega = self.coalescence_factor,
+                omega=self.coalescence_factor,
                 timestep=iteration_id,
             )
 
             print(f"RECURTION Level {level}, stream_coarse_step_C")
 
-            self.stepper.launch_container(
-                streamId=0,
+            self.stepper.add_to_app(
+                app=app,
                 op_name="stream_coarse_step_C",
                 mres_level=level,
                 f_0=self.f_1,
@@ -176,12 +182,10 @@ def recurtion(level):
             #     sys.exit()
             #     pass
 
-
         self.iteration_idx += 1
         iteration_id = self.iteration_idx % 2
 
-        recurtion(self.count_levels-1)
-
-
-
-
+        recurtion(self.count_levels - 1, app=self.app)
+        bk = self.grid.get_neon_backend()
+        self.sk = neon.Skeleton(backend=bk)
+        self.sk.sequence("mres_nse_stepper", self.app)
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 52e5f7ed..6a6bd01c 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -26,15 +26,15 @@
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
     def __init__(
-        self,
-        grid,
-        boundary_conditions=[],
-        collision_type="BGK",
-        forcing_scheme="exact_difference",
-        force_vector=None,
+            self,
+            grid,
+            boundary_conditions=[],
+            collision_type="BGK",
+            forcing_scheme="exact_difference",
+            force_vector=None,
     ):
         super().__init__(grid, boundary_conditions)
-        self.odd_or_even='even'
+        self.odd_or_even = 'even'
         self.c_even = None
         self.c_odd = None
 
@@ -45,7 +45,8 @@ def __init__(
             self.collision = KBC(self.velocity_set, self.precision_policy, self.compute_backend)
 
         if force_vector is not None:
-            self.collision = ForcedCollision(collision_operator=self.collision, forcing_scheme=forcing_scheme, force_vector=force_vector)
+            self.collision = ForcedCollision(collision_operator=self.collision, forcing_scheme=forcing_scheme,
+                                             force_vector=force_vector)
 
         # Construct the operators
         self.stream = Stream(self.velocity_set, self.precision_policy, self.compute_backend)
@@ -75,16 +76,18 @@ def prepare_fields(self, rho, u, initializer=None):
         bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
         from xlb.helper.initializers import initialize_multires_eq
-        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
+        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend,
+                                     rho=rho, u=u)
 
         for level in range(self.grid.count_levels):
             f_1.copy_from_run(level, f_0, 0)
         f_0.update_host(0)
         wp.synchronize()
-        #f_0.export_vti("f0_eq_init.vti", "init_f0")
+        # f_0.export_vti("f0_eq_init.vti", "init_f0")
 
         # Process boundary conditions and update masks
-        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
+        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask,
+                                                                  xlb_grid=self.grid)
         # Initialize auxiliary data if needed
         f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
         # bc_mask.update_host(0)
@@ -92,8 +95,8 @@ def prepare_fields(self, rho, u, initializer=None):
         f_0.update_host(0)
         wp.synchronize()
         bc_mask.export_vti("bc_mask.vti", 'bc_mask')
-        #f_0.export_vti("init_f0.vti", 'init_f0')
-        #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
+        # f_0.export_vti("init_f0.vti", 'init_f0')
+        # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         return f_0, f_1, bc_mask, missing_mask
 
@@ -124,7 +127,9 @@ def cl_collide_coarse(index: typing.Any):
                                                                  wp.int8(_c[2, l]))
                                 val = self.compute_dtype(1)
                                 wp.neon_mres_lbm_store_op(coalescence_factor_pn, index, l, push_direction, val)
+
                 loader.declare_kernel(cl_collide_coarse)
+
             return ll_coalescence_count
 
         for level in range(num_levels):
@@ -162,7 +167,6 @@ def compute(index: typing.Any):
                         # HERE: we are a halo cell so we just exit
                         return
 
-
                     for l in range(self.velocity_set.q):
                         if l == 9:
                             # HERE, we skip the center direction
@@ -173,7 +177,8 @@ def compute(index: typing.Any):
                                                          wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        coalescence_factor = wp.neon_read_ngh(coalescence_factor_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                        coalescence_factor = wp.neon_read_ngh(coalescence_factor_pn, index, pull_direction, l,
+                                                              self.compute_dtype(0), has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(coalescence_factor_pn, index, pull_direction):
@@ -194,13 +199,15 @@ def compute(index: typing.Any):
                                 # YES ngh. at the same level
                                 # -> **Coalescence**
                                 if coalescence_factor > self.compute_dtype(0):
-                                    coalescence_factor = self.compute_dtype(1)/( self.compute_dtype(2)*coalescence_factor)
+                                    coalescence_factor = self.compute_dtype(1) / (
+                                            self.compute_dtype(2) * coalescence_factor)
                                     wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
 
                             else:
                                 wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                 loader.declare_kernel(compute)
+
             return loading
 
         for level in range(num_levels):
@@ -252,7 +259,7 @@ def _construct_neon(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _opp_indices = self.velocity_set.opp_indices
-        #_cast_to_store_dtype = self.store_dtype()
+        # _cast_to_store_dtype = self.store_dtype()
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
@@ -268,15 +275,15 @@ def _construct_neon(self):
 
         @wp.func
         def apply_bc(
-            index: Any,
-            timestep: Any,
-            _boundary_id: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-            is_post_streaming: bool,
+                index: Any,
+                timestep: Any,
+                _boundary_id: Any,
+                missing_mask: Any,
+                f_0: Any,
+                f_1: Any,
+                f_pre: Any,
+                f_post: Any,
+                is_post_streaming: bool,
         ):
             f_result = f_post
 
@@ -285,11 +292,15 @@ def apply_bc(
                 if is_post_streaming:
                     if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep,
+                                                                                              missing_mask, f_0, f_1,
+                                                                                              f_pre, f_post)
                 else:
                     if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep,
+                                                                                              missing_mask, f_0, f_1,
+                                                                                              f_pre, f_post)
                     if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
                             f_result = wp.static(self.boundary_conditions[i].prepare_bc_auxilary_data)(
@@ -299,10 +310,10 @@ def apply_bc(
 
         @wp.func
         def neon_get_thread_data(
-            f0_pn: Any,
-            f1_pn: Any,
-            missing_mask_pn: Any,
-            index: Any,
+                f0_pn: Any,
+                f1_pn: Any,
+                missing_mask_pn: Any,
+                index: Any,
         ):
             # Read thread data for populations
             _f0_thread = _f_vec()
@@ -318,7 +329,6 @@ def neon_get_thread_data(
 
         import typing
 
-
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(
                 level: int,
@@ -332,21 +342,28 @@ def collide_coarse(
             num_levels = f_0_fd.get_grid().get_num_levels()
 
             # module op to define odd of even iteration
-            even_itertation = wp.mod(timestep, 2)==0
+            even_itertation = wp.mod(timestep, 2) == 0
 
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-                f_0_pn=loader.get_mres_read_handle(f_0_fd)
-                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
-                f_1_pn =loader.get_mres_write_handle(f_1_fd)
+                if level + 1 < f_0_fd.get_grid().get_num_levels():
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                # fake loading to enforce sequential step
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
                 _c = self.velocity_set.c
                 _w = self.velocity_set.w
 
                 @wp.func
-                def cl_collide_coarse(index: typing.Any):
+                def device(index: typing.Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     """
                     The c++ version starts with the following, which I am not sure is right:
@@ -359,7 +376,8 @@ def cl_collide_coarse(index: typing.Any):
                     if not wp.neon_has_child(f_0_pn, index):
 
                         # Read thread data for populations, these are post streaming
-                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn,
+                                                                                     index)
                         _f_post_stream = _f0_thread
 
                         _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
@@ -367,13 +385,13 @@ def cl_collide_coarse(index: typing.Any):
                         _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
 
                         # Apply post-collision boundary conditions
-                        #_f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+                        # _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
 
                         for l in range(self.velocity_set.q):
                             push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
                                                              wp.int8(_c[1, l]),
                                                              wp.int8(_c[2, l]))
-                            if(level < num_levels - 1):
+                            if (level < num_levels - 1):
                                 val = _f_post_collision[l]
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
                                 wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
@@ -384,26 +402,28 @@ def cl_collide_coarse(index: typing.Any):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
                             wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
 
-                loader.declare_kernel(cl_collide_coarse)
+                loader.declare_kernel(device)
+
             return ll_collide_coarse
 
         @neon.Container.factory(name="stream_coarse_step_A")
         def stream_coarse_step_A(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
         ):
             num_levels = f_0_fd.get_grid().get_num_levels()
+
             # if level != 0:
             #     # throw an exception
             #     raise Exception("Only the finest level is supported for now")
 
             # module op to define odd of even iteration
-            #od_or_even = wp.module("odd_or_even", "even")
+            # od_or_even = wp.module("odd_or_even", "even")
 
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -438,7 +458,7 @@ def cl_stream_coarse(index: typing.Any):
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    #wp.print("stream_coarse")
+                    # wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -446,13 +466,13 @@ def cl_stream_coarse(index: typing.Any):
 
         @neon.Container.factory(name="stream_coarse_step_B")
         def stream_coarse_step_B(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
         ):
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -462,7 +482,7 @@ def ll_stream_coarse(loader: neon.Loader):
 
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-                coalescence_factor_pn  = loader.get_mres_read_handle(coalescence_factor_fd)
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
 
                 _c = self.velocity_set.c
                 _w = self.velocity_set.w
@@ -478,7 +498,6 @@ def cl_stream_coarse(index: typing.Any):
                         # HERE: we are a halo cell so we just exit
                         return
 
-
                     for l in range(self.velocity_set.q):
                         if l == 9:
                             # HERE, we skip the center direction
@@ -489,7 +508,8 @@ def cl_stream_coarse(index: typing.Any):
                                                          wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
+                                                       has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
@@ -500,7 +520,9 @@ def cl_stream_coarse(index: typing.Any):
                                 if wp.neon_has_parent(f_0_pn, index):
                                     # YES halo cell on top of us
                                     has_a_courser_ngh = wp.bool(False)
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l,
+                                                                                self.compute_dtype(0),
+                                                                                has_a_courser_ngh)
                                     if has_a_courser_ngh:
                                         # Full state:
                                         # NO finer ngh. in the pull direction (opposite of l)
@@ -525,26 +547,25 @@ def cl_stream_coarse(index: typing.Any):
                                 # YES ngh. at the same level
                                 # -> **Coalescence**
                                 coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
-                                accumulated = accumulated *coalescence_factor
+                                accumulated = accumulated * coalescence_factor
                                 wp.neon_write(f_1_pn, index, l, accumulated)
 
                             else:
                                 wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
-
                 loader.declare_kernel(cl_stream_coarse)
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse")
+        @neon.Container.factory(name="stream_coarse_step_C")
         def stream_coarse_step_C(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
         ):
 
             def ll_stream_coarse(loader: neon.Loader):
@@ -576,7 +597,6 @@ def cl_stream_coarse(index: typing.Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = _f1_thread
 
-
                     # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(
                         index, timestep,
@@ -594,13 +614,13 @@ def cl_stream_coarse(index: typing.Any):
             return ll_stream_coarse
 
         return None, {
-            #"single_step_finest": single_step_finest,
+            # "single_step_finest": single_step_finest,
             "collide_coarse": collide_coarse,
             "stream_coarse_step_A": stream_coarse_step_A,
             "stream_coarse_step_B": stream_coarse_step_B,
             "stream_coarse_step_C": stream_coarse_step_C}
 
-    def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+    def get_containers(self, target_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         containers = {'even': {}, 'odd': {}}
         _, container = self._construct_neon()
         for key in container.keys():
@@ -609,14 +629,17 @@ def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega,
         return containers
 
     def init_containers(self):
-        self.containers=None
+        self.containers = None
         _, self.containers = self._construct_neon()
 
-    def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+    def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
+    def add_to_app(self, app, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
+        app.append(self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
+
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+    def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
         c.run(0)
         return f_0, f_1
diff --git a/xlb/operator/stepper/nse_multires_stepper_vk.py b/xlb/operator/stepper/nse_multires_stepper_vk.py
index 64fa668e..3e45f95b 100644
--- a/xlb/operator/stepper/nse_multires_stepper_vk.py
+++ b/xlb/operator/stepper/nse_multires_stepper_vk.py
@@ -1,5 +1,5 @@
 # Base class for all stepper operators
-
+import typing
 from functools import partial
 
 from docutils.nodes import container
@@ -455,6 +455,9 @@ def init_containers(self):
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
+    def add_to_app(self, app:typing.List, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+        app.append(self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
         #if self.c is None:

From b91acd48bbb0007dca671ea3159dfc9cd656c4d6 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 12:37:16 +0200
Subject: [PATCH 029/208] Benchmark: synchronize before stopping the timer; report num_levels and EMLUPs.

---
 .../3_levels_mlups_3d_multires_solver.py      | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index c7cd9ba3..eeb030ae 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -112,7 +112,8 @@ def get_peeled_np(level, width):
     l1 = get_peeled_np(1, 7)
     l2 = get_peeled_np(2, 4)
 
-    lastLevel = 3
+    num_levels = 4
+    lastLevel = num_levels -1
     divider = 2**lastLevel
     m = neon.Index_3d(dim.x // divider +1, dim.y // divider+1, dim.z // divider+1)
     lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
@@ -156,21 +157,23 @@ def get_peeled_np(level, width):
 
     print("start timing")
     wp.synchronize()
-    start_time = time.time()
 
+    start_time = time.time()
     for i in range(num_steps):
         sim.step()
         if i%100 == 0:
             print(f"step {i}")
         #    sim.export_macroscopic("u_lid_driven_cavity_")
+    wp.synchronize()
     t = time.time() - start_time
+    print(f"Timing  {t}")
 
     sim.export_macroscopic("u_lid_driven_cavity_")
-    return t
+    return {"time":t, "num_levels":num_levels}
 
 
-def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    num_step_finer = num_steps * 2**(4-1)
+def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
+    num_step_finer = num_steps * 2**(num_levels-1)
     total_lattice_updates = cube_edge**3 * num_step_finer
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
@@ -193,11 +196,13 @@ def main():
     args = parse_arguments()
     backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
-    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
+    stats = run(backend, precision_policy, grid_shape, args.num_steps)
+    mlups = calculate_mlups(args.cube_edge, args.num_steps, stats['time'], stats['num_levels'])
 
-    print(f"Simulation completed in {elapsed_time:.2f} seconds")
-    print(f"MLUPs: {mlups:.2f}")
+    print(f"Simulation completed in {stats['time']:.2f} seconds")
+    print(f"Number of levels {stats['num_levels']}")
+    print(f"Cube edge {args.cube_edge}")
+    print(f"EMLUPs: {mlups:.2f}")
 
 
 if __name__ == "__main__":

From cc85c9707c98779692cc6f570d5c594d816a67a7 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 14:23:06 +0200
Subject: [PATCH 030/208] Benchmark: disable initial export and warm-up step; print coarse and fine iteration counts.

---
 .../performance/3_levels_mlups_3d_multires_solver.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index eeb030ae..dc3eee6f 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -152,8 +152,8 @@ def get_peeled_np(level, width):
 
     sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
 
-    sim.export_macroscopic("Initial_")
-    sim.step()
+    # sim.export_macroscopic("Initial_")
+    # sim.step()
 
     print("start timing")
     wp.synchronize()
@@ -176,7 +176,7 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
     num_step_finer = num_steps * 2**(num_levels-1)
     total_lattice_updates = cube_edge**3 * num_step_finer
     mlups = (total_lattice_updates / elapsed_time) / 1e6
-    return mlups
+    return {"EMLUPS":mlups, "finer_steps":num_step_finer}
 
 
     # # remove boundary cells
@@ -197,12 +197,14 @@ def main():
     backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
     stats = run(backend, precision_policy, grid_shape, args.num_steps)
-    mlups = calculate_mlups(args.cube_edge, args.num_steps, stats['time'], stats['num_levels'])
+    mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats['time'], stats['num_levels'])
 
     print(f"Simulation completed in {stats['time']:.2f} seconds")
     print(f"Number of levels {stats['num_levels']}")
     print(f"Cube edge {args.cube_edge}")
-    print(f"EMLUPs: {mlups:.2f}")
+    print(f"Coarse Iterations {args.num_steps}")
+    print(f"Fine Iterations {lups_stats["finer_steps"]}")
+    print(f"EMLUPs: {mlups_stats["EMLUPS"]:.2f}")
 
 
 if __name__ == "__main__":

From 1c6aca352afba672b8e96a4f98f281ce21a65052 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 14:28:35 +0200
Subject: [PATCH 031/208] Benchmark: fix lups_stats -> mlups_stats name typo in stats printing.

---
 examples/performance/3_levels_mlups_3d_multires_solver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index dc3eee6f..eb65d50f 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -203,7 +203,7 @@ def main():
     print(f"Number of levels {stats['num_levels']}")
     print(f"Cube edge {args.cube_edge}")
     print(f"Coarse Iterations {args.num_steps}")
-    print(f"Fine Iterations {lups_stats["finer_steps"]}")
+    print(f"Fine Iterations {mlups_stats["finer_steps"]}")
     print(f"EMLUPs: {mlups_stats["EMLUPS"]:.2f}")
 
 

From a2aad437b01b2dff0c2699222bb1941a682e8ad0 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 14:30:43 +0200
Subject: [PATCH 032/208] Benchmark: read finer_steps into a local instead of nesting double quotes in the f-string.

---
 examples/performance/3_levels_mlups_3d_multires_solver.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index eb65d50f..8dcdebd8 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -203,7 +203,8 @@ def main():
     print(f"Number of levels {stats['num_levels']}")
     print(f"Cube edge {args.cube_edge}")
     print(f"Coarse Iterations {args.num_steps}")
-    print(f"Fine Iterations {mlups_stats["finer_steps"]}")
+    finer_steps = mlups_stats["finer_steps"]
+    print(f"Fine Iterations {finer_steps}")
     print(f"EMLUPs: {mlups_stats["EMLUPS"]:.2f}")
 
 

From a165017040c155dde417663590af4f4ae3de31a6 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 14:32:25 +0200
Subject: [PATCH 033/208] Benchmark: read EMLUPS into a local instead of nesting double quotes in the f-string.

---
 examples/performance/3_levels_mlups_3d_multires_solver.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index 8dcdebd8..341fe73b 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -205,7 +205,8 @@ def main():
     print(f"Coarse Iterations {args.num_steps}")
     finer_steps = mlups_stats["finer_steps"]
     print(f"Fine Iterations {finer_steps}")
-    print(f"EMLUPs: {mlups_stats["EMLUPS"]:.2f}")
+    EMLUPS = mlups_stats["EMLUPS"]
+    print(f"EMLUPs: {EMLUPS:.2f}")
 
 
 if __name__ == "__main__":

From 558b044dfe48b5712db6e49effdf88c2ee677b32 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 20:37:22 +0200
Subject: [PATCH 034/208] Multires solver: pass timestep=0 to the containers and drop the iteration_idx bookkeeping.

---
 xlb/helper/nse_multires_solver.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index e377572e..e2e40245 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -112,7 +112,7 @@ def recurtion(level, app):
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
                 omega=self.omega,
-                timestep=iteration_id,
+                timestep=0,
             )
             # if(level == 0):
             #     wp.synchronize()
@@ -140,7 +140,7 @@ def recurtion(level, app):
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
                 omega=self.omega,
-                timestep=iteration_id,
+                timestep=0,
             )
             print(f"RECURTION Level {level}, stream_coarse_step_B")
 
@@ -153,7 +153,7 @@ def recurtion(level, app):
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
                 omega=self.coalescence_factor,
-                timestep=iteration_id,
+                timestep=0,
             )
 
             print(f"RECURTION Level {level}, stream_coarse_step_C")
@@ -167,7 +167,7 @@ def recurtion(level, app):
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
                 omega=self.omega,
-                timestep=iteration_id,
+                timestep=0,
             )
             # if(level == 1):
             #     wp.synchronize()
@@ -182,9 +182,6 @@ def recurtion(level, app):
             #     sys.exit()
             #     pass
 
-        self.iteration_idx += 1
-        iteration_id = self.iteration_idx % 2
-
         recurtion(self.count_levels - 1, app=self.app)
         bk = self.grid.get_neon_backend()
         self.sk = neon.Skeleton(backend=bk)

From 7dac5226d101a8cf57c5a23bb26a06f8b3500735 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 15 May 2025 20:37:37 +0200
Subject: [PATCH 035/208] Benchmark: remove stale commented-out code and extra blank lines.

---
 examples/performance/3_levels_mlups_3d_multires_solver.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index 341fe73b..e48a6f4e 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -146,10 +146,6 @@ def get_peeled_np(level, width):
     omega = 1.0 / (3.0 * visc + 0.5)
     #omega = 1.0
 
-
-    # # Initialize fields and run simulation
-    # omega = 1.0
-
     sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
 
     # sim.export_macroscopic("Initial_")
@@ -157,7 +153,6 @@ def get_peeled_np(level, width):
 
     print("start timing")
     wp.synchronize()
-
     start_time = time.time()
     for i in range(num_steps):
         sim.step()

From 92c97b8954c045c191b343f176382636a3cc76cb Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 19 May 2025 10:51:25 +0200
Subject: [PATCH 036/208] Add fused stream_coarse_step_ABC container; remove nse_multires_stepper_vk.py.

---
 xlb/helper/nse_multires_solver.py             |  30 +-
 xlb/operator/stepper/nse_multires_stepper.py  | 138 +++++
 .../stepper/nse_multires_stepper_vk.py        | 484 ------------------
 3 files changed, 153 insertions(+), 499 deletions(-)
 delete mode 100644 xlb/operator/stepper/nse_multires_stepper_vk.py

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index e2e40245..13bf71fe 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -130,23 +130,10 @@ def recurtion(level, app):
             recurtion(level - 1, app)
             recurtion(level - 1, app)
 
-            print(f"RECURTION Level {level}, stream_coarse_step_A")
+            print(f"RECURTION Level {level}, stream_coarse_step_ABC")
             self.stepper.add_to_app(
                 app=app,
-                op_name="stream_coarse_step_A",
-                mres_level=level,
-                f_0=self.f_1,
-                f_1=self.f_0,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
-                omega=self.omega,
-                timestep=0,
-            )
-            print(f"RECURTION Level {level}, stream_coarse_step_B")
-
-            self.stepper.add_to_app(
-                app=app,
-                op_name="stream_coarse_step_B",
+                op_name="stream_coarse_step_ABC",
                 mres_level=level,
                 f_0=self.f_1,
                 f_1=self.f_0,
@@ -155,6 +142,19 @@ def recurtion(level, app):
                 omega=self.coalescence_factor,
                 timestep=0,
             )
+            # print(f"RECURTION Level {level}, stream_coarse_step_B")
+            #
+            # self.stepper.add_to_app(
+            #     app=app,
+            #     op_name="stream_coarse_step_B",
+            #     mres_level=level,
+            #     f_0=self.f_1,
+            #     f_1=self.f_0,
+            #     bc_mask=self.bc_mask,
+            #     missing_mask=self.missing_mask,
+            #     omega=self.coalescence_factor,
+            #     timestep=0,
+            # )
 
             print(f"RECURTION Level {level}, stream_coarse_step_C")
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 6a6bd01c..36207d79 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -406,6 +406,143 @@ def device(index: typing.Any):
 
             return ll_collide_coarse
 
+        @neon.Container.factory(name="stream_coarse_step_ABC")
+        def stream_coarse_step_ABC(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().get_num_levels()
+
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+
+                coalescence_factor_fd = omega
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
+
+
+                @wp.func
+                def cl_stream_coarse(index: typing.Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn,
+                                                                                 f_1_pn,
+                                                                                 missing_mask_pn,
+                                                                                 index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    for l in range(self.velocity_set.q):
+                        if l == 9:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
+                                                         wp.int8(-_c[1, l]),
+                                                         wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
+                                                       has_ngh_at_same_level)
+
+                        # if (!pin.hasChildren(cell, dir)) {
+                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                            # NO finer ngh. in the pull direction (opposite of l)
+                            if not has_ngh_at_same_level:
+                                # NO ngh. at the same level
+                                # COULD we have a ngh. at the courser level?
+                                if wp.neon_has_parent(f_0_pn, index):
+                                    # YES halo cell on top of us
+                                    has_a_courser_ngh = wp.bool(False)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l,
+                                                                                self.compute_dtype(0),
+                                                                                has_a_courser_ngh)
+                                    if has_a_courser_ngh:
+                                        # Full state:
+                                        # NO finer ngh. in the pull direction (opposite of l)
+                                        # NO ngh. at the same level
+                                        # YES ghost cell on top of us
+                                        # YES courser ngh.
+                                        # -> **Explosion**
+                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        _f_post_stream[l] = exploded_pop
+                        else:
+                            # HERE -> I have a finer ngh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer ngh.
+                            if has_ngh_at_same_level:
+                                # if l == 10:
+                                #     wp.print(accumulated)
+                                #     glob = wp.neon_global_idx(f_1_pn, index)
+                                #     wp.neon_cuda_info()
+                                #     wp.neon_print(glob)
+                                #     wp.neon_level(f_1_pn)
+                                # accumulated = _w[l]
+                                # Full State
+                                # YES finer ngh. in the pull direction (opposite of l)
+                                # YES ngh. at the same level
+                                # -> **Coalescence**
+                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
+                                accumulated = accumulated * coalescence_factor
+                                #wp.neon_write(f_1_pn, index, l, accumulated)
+                                _f_post_stream[l] = accumulated
+                            else:
+                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                    # wp.print("stream_coarse")
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
         @neon.Container.factory(name="stream_coarse_step_A")
         def stream_coarse_step_A(
                 level: int,
@@ -616,6 +753,7 @@ def cl_stream_coarse(index: typing.Any):
         return None, {
             # "single_step_finest": single_step_finest,
             "collide_coarse": collide_coarse,
+            "stream_coarse_step_ABC": stream_coarse_step_ABC,
             "stream_coarse_step_A": stream_coarse_step_A,
             "stream_coarse_step_B": stream_coarse_step_B,
             "stream_coarse_step_C": stream_coarse_step_C}
diff --git a/xlb/operator/stepper/nse_multires_stepper_vk.py b/xlb/operator/stepper/nse_multires_stepper_vk.py
deleted file mode 100644
index 3e45f95b..00000000
--- a/xlb/operator/stepper/nse_multires_stepper_vk.py
+++ /dev/null
@@ -1,484 +0,0 @@
-# Base class for all stepper operators
-import typing
-from functools import partial
-
-from docutils.nodes import container
-from jax import jit
-import warp as wp
-import neon
-from typing import Any
-
-from xlb import DefaultConfig
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import Precision
-from xlb.operator import Operator
-from xlb.operator.stream import Stream
-from xlb.operator.collision import BGK, KBC
-from xlb.operator.equilibrium import QuadraticEquilibrium
-from xlb.operator.macroscopic import Macroscopic
-from xlb.operator.stepper import Stepper
-from xlb.operator.boundary_condition.boundary_condition import ImplementationStep
-from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
-from xlb.operator.collision import ForcedCollision
-from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
-from xlb.helper import check_bc_overlaps
-
-
-class MultiresIncompressibleNavierStokesStepper(Stepper):
-    def __init__(
-        self,
-        grid,
-        boundary_conditions=[],
-        collision_type="BGK",
-        forcing_scheme="exact_difference",
-        force_vector=None,
-    ):
-        super().__init__(grid, boundary_conditions)
-        self.odd_or_even='even'
-        self.c_even = None
-        self.c_odd = None
-
-        # Construct the collision operator
-        if collision_type == "BGK":
-            self.collision = BGK(self.velocity_set, self.precision_policy, self.compute_backend)
-        elif collision_type == "KBC":
-            self.collision = KBC(self.velocity_set, self.precision_policy, self.compute_backend)
-
-        if force_vector is not None:
-            self.collision = ForcedCollision(collision_operator=self.collision, forcing_scheme=forcing_scheme, force_vector=force_vector)
-
-        # Construct the operators
-        self.stream = Stream(self.velocity_set, self.precision_policy, self.compute_backend)
-        self.equilibrium = QuadraticEquilibrium(self.velocity_set, self.precision_policy, self.compute_backend)
-        self.macroscopic = Macroscopic(self.velocity_set, self.precision_policy, self.compute_backend)
-
-    def prepare_fields(self, rho, u, initializer=None):
-        """Prepare the fields required for the stepper.
-
-        Args:
-            initializer: Optional operator to initialize the distribution functions.
-                        If provided, it should be a callable that takes (grid, velocity_set,
-                        precision_policy, compute_backend) as arguments and returns initialized f_0.
-                        If None, default equilibrium initialization is used with rho=1 and u=0.
-
-        Returns:
-            Tuple of (f_0, f_1, bc_mask, missing_mask):
-                - f_0: Initial distribution functions
-                - f_1: Copy of f_0 for double-buffering
-                - bc_mask: Boundary condition mask indicating which BC applies to each node
-                - missing_mask: Mask indicating which populations are missing at boundary nodes
-        """
-
-        f_0 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-        f_1 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-        missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
-        bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
-
-        from xlb.helper.initializers import initialize_multires_eq
-        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
-
-        for level in range(self.grid.count_levels):
-            f_1.copy_from_run(level, f_0, 0)
-        f_0.update_host(0)
-        wp.synchronize()
-        f_0.export_vti("f0_eq_init.vti", "init_f0")
-
-        # Process boundary conditions and update masks
-        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
-        # Initialize auxiliary data if needed
-        f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
-        # bc_mask.update_host(0)
-        bc_mask.update_host(0)
-        f_0.update_host(0)
-        wp.synchronize()
-        bc_mask.export_vti("bc_mask.vti", 'bc_mask')
-        f_0.export_vti("init_f0.vti", 'init_f0')
-
-        #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
-
-        return f_0, f_1, bc_mask, missing_mask
-
-    @classmethod
-    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
-        """Process boundary conditions and update boundary masks."""
-        # Check for boundary condition overlaps
-        check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
-        # Create boundary maskers
-        indices_masker = IndicesBoundaryMasker(
-            velocity_set=DefaultConfig.velocity_set,
-            precision_policy=DefaultConfig.default_precision_policy,
-            compute_backend=DefaultConfig.default_backend,
-        )
-        # Split boundary conditions by type
-        bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
-        bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
-        # Process indices-based boundary conditions
-        if bc_with_indices:
-            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
-        # Process mesh-based boundary conditions for 3D
-        if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
-            # throw an exception because this option is not implemented yet
-            raise Exception("Mesh-based boundary conditions are not implemented yet")
-            # mesh_masker = MeshBoundaryMasker(
-            #     velocity_set=DefaultConfig.velocity_set,
-            #     precision_policy=DefaultConfig.default_precision_policy,
-            #     compute_backend=DefaultConfig.default_backend,
-            # )
-            # for bc in bc_with_vertices:
-            #     bc_mask, missing_mask = mesh_masker(bc, bc_mask, missing_mask)
-
-        return bc_mask, missing_mask
-
-    @staticmethod
-    def _initialize_auxiliary_data(boundary_conditions, f_0, f_1, bc_mask, missing_mask):
-        """Initialize auxiliary data for boundary conditions that require it."""
-        for bc in boundary_conditions:
-            if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
-                f_0, f_1 = bc.aux_data_init(f_0, f_1, bc_mask, missing_mask)
-        return f_0, f_1
-
-    def _construct_neon(self):
-        # Set local constants
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-        _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
-        _opp_indices = self.velocity_set.opp_indices
-        #_cast_to_store_dtype = self.store_dtype()
-
-        # Read the list of bc_to_id created upon instantiation
-        bc_to_id = boundary_condition_registry.bc_to_id
-        id_to_bc = boundary_condition_registry.id_to_bc
-        _zero = self.compute_dtype(0)
-        # Gather IDs of ExtrapolationOutflowBC boundary conditions
-        extrapolation_outflow_bc_ids = []
-        for bc_name, bc_id in bc_to_id.items():
-            if bc_name.startswith("ExtrapolationOutflowBC"):
-                extrapolation_outflow_bc_ids.append(bc_id)
-        # Group active boundary conditions
-        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
-
-        @wp.func
-        def apply_bc(
-            index: Any,
-            timestep: Any,
-            _boundary_id: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-            is_post_streaming: bool,
-        ):
-            f_result = f_post
-
-            # Unroll the loop over boundary conditions
-            for i in range(wp.static(len(self.boundary_conditions))):
-                if is_post_streaming:
-                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
-                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
-                else:
-                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
-                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
-                    if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
-                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].prepare_bc_auxilary_data)(
-                                index, timestep, missing_mask, f_0, f_1, f_pre, f_post
-                            )
-            return f_result
-
-        @wp.func
-        def neon_get_thread_data(
-            f0_pn: Any,
-            f1_pn: Any,
-            missing_mask_pn: Any,
-            index: Any,
-        ):
-            # Read thread data for populations
-            _f0_thread = _f_vec()
-            _f1_thread = _f_vec()
-            _missing_mask = _missing_mask_vec()
-            for l in range(self.velocity_set.q):
-                # q-sized vector of pre-streaming populations
-                _f0_thread[l] = self.compute_dtype(wp.neon_read(f0_pn, index, l))
-                _f1_thread[l] = self.compute_dtype(wp.neon_read(f1_pn, index, l))
-                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
-
-            return _f0_thread, _f1_thread, _missing_mask
-
-        import typing
-        @neon.Container.factory(name="finest_collide")
-        def single_step_finest(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
-        ):
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            od_or_even = wp.module("odd_or_even", "even")
-
-            def ll_single_step_finest(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
-                f_0_pn=loader.get_mres_read_handle(f_0_fd)
-                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
-
-                f_1_pn =loader.get_mres_write_handle(f_1_fd)
-
-                @wp.func
-                def cl_single_step_finest(index: typing.Any):
-                    _c = self.velocity_set.c
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
-                        return
-
-                    # Read thread data for populations, these are post streaming
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
-                    _f_post_stream = _f0_thread
-
-                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                    _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
-
-                    # Apply post-collision boundary conditions
-                    _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
-
-                    # Apply streaming boundary conditions
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, True)
-                    _opposite_c_idx = self.velocity_set.self.opp_indices
-
-                    for l in range(self.velocity_set.q):
-                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
-                                                   wp.int8(_c[1, l]),
-                                                   wp.int8(_c[2, l]))
-                        ## Store
-                        if od_or_even == 0:
-                            wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_stream[l])
-                        else:
-                            wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction,_f_post_stream[l])
-
-                        ## Push stream
-                        is_active = wp.neon_is_active(f_0_pn, index, push_direction)
-                        if is_active:
-                            ngh_gidx = wp.neon_ngh_idx(f_0_pn, index, push_direction)
-                            ngh_boundary_id = wp.neon_read(bc_mask_pn, ngh_gidx, 0)
-                            ## WHAT IS BULK?
-                            if ngh_boundary_id == BULK:
-                                wp.neon_write(f_1_pn, ngh_gidx, l, _f_post_stream[l])
-                            else:
-                                opposite_l = _opp_indices[l]
-                                wp.neon_write(f_1_pn, index, opposite_l, _f_post_stream[l])
-                        else:
-                            if wp.int8(_c[0, l]) != 0 and wp.int8(_c[1, l]) != 0 and wp.int8(_c[2, l]) != 0:
-                                opposite_l = _opp_indices[l]
-                                is_valid = False
-                                value = self.compute_dtype(0)
-                                if od_or_even == 0:
-                                    value = wp.neon_read_uncle(f_1_pn, index, push_direction, opposite_l, value, is_valid)
-                                else:
-                                    value = wp.neon_read_uncle(f_0_pn, index, push_direction, opposite_l, value, is_valid)
-                                if is_valid:
-                                    wp.neon_write(f_1_pn, index, l, _f_post_stream[l], value)
-
-                loader.declare_kernel(cl_single_step_finest)
-            return ll_single_step_finest
-
-        @neon.Container.factory(name="collide_coarse")
-        def collide_coarse(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
-        ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
-
-            # module op to define odd of even iteration
-            even_itertation = wp.mod(timestep, 2)==0
-
-            def ll_collide_coarse(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
-                f_0_pn=loader.get_mres_read_handle(f_0_fd)
-                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
-                f_1_pn =loader.get_mres_write_handle(f_1_fd)
-
-                _c = self.velocity_set.c
-
-                @wp.func
-                def cl_collide_coarse(index: typing.Any):
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    """
-                    The c++ version starts with the following, which I am not sure is right:
-                        if (type(cell, 0) == CellType::bulk ) {
-                    CB type cells should do collide too  
-                    """
-                    if _boundary_id == wp.uint8(255):
-                        return
-
-                    if not wp.neon_has_child(f_0_pn, index):
-
-                        # Read thread data for populations, these are post streaming
-                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
-                        _f_post_stream = _f0_thread
-
-                        _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                        _feq = self.equilibrium.neon_functional(_rho, _u)
-                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
-
-                        # Apply post-collision boundary conditions
-                        _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
-
-                        for l in range(self.velocity_set.q):
-                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                            if(level < num_levels - 1):
-                                ## Store
-                                # if even_itertation == 0:
-                                #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
-                                # else:
-                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_collision[l])
-
-                            wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
-                    else:
-                        for l in range(self.velocity_set.q):
-                            wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
-                    wp.print("stream_coarse")
-
-                loader.declare_kernel(cl_collide_coarse)
-            return ll_collide_coarse
-
-        @neon.Container.factory(name="stream_coarse")
-        def stream_coarse(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            #od_or_even = wp.module("odd_or_even", "even")
-
-            def ll_stream_coarse(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
-                f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
-                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-
-                _c = self.velocity_set.c
-
-                @wp.func
-                def cl_stream_coarse(index: typing.Any):
-                    _missing_mask = _missing_mask_vec()
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id != wp.uint8(255):
-                        #  if (!pin.hasChildren(cell)) {
-                        if not wp.neon_has_child(f_0_pn, index):
-                            # do stream normally
-                            _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
-                            _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-
-                            # do mres corrections
-                            for l in range(self.velocity_set.q):
-                                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-                                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
-
-                                #  if (!pin.hasChildren(cell, dir)) {
-                                if not wp.neon_has_child(f_0_pn, index, pull_direction):
-                                    #if (nghType.mIsValid) {
-                                    # NOTHING as taken  care after
-                                    # } else if (pin.hasParent(cell) && !(dir.x == 0 && dir.y == 0 && dir.z == 0)) {
-                                    if wp.neon_has_parent(f_0_pn, index):
-                                        if pull_direction.x != 0 or pull_direction.y != 0 or pull_direction.z != 0:
-                                            # is_valid = wp.bool(False)
-                                            # uncle_val = wp.neon_read_uncle(f_0_pn, index, pull_direction, l, self.compute_dtype(0), is_valid)
-                                            # if is_valid:
-                                            #     #_f_post_stream[l] = uncle_val
-                                            #     # HERE DB
-                                            _f_post_stream[l] =  self.compute_dtype(0.0)
-                                else:
-                                    is_valid = wp.bool(False)
-                                    read_accumulate_date = wp.neon_read_ngh(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
-                                    if is_valid:
-                                        #_f_post_stream[l] = read_accumulate_date * self.compute_dtype(0.5)
-                                        # HERE DB
-                                        _f_post_stream[l] = self.compute_dtype(0.0)
-
-                            # do non mres post-streaming corrections
-                            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_stream, True)
-
-                            for l in range(self.velocity_set.q):
-                                wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    # wp.print("stream_coarse")
-
-                loader.declare_kernel(cl_stream_coarse)
-
-            return ll_stream_coarse
-
-        return None, {
-            #"single_step_finest": single_step_finest,
-            "collide_coarse": collide_coarse,
-            "stream_coarse": stream_coarse}
-
-    def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        containers = {'even': {}, 'odd': {}}
-        _, container = self._construct_neon()
-        for key in container.keys():
-            containers['odd'][key] = container[key](target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)
-            containers['even'][key] = container[key](target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
-        return containers
-
-    def init_containers(self):
-        self.containers=None
-        _, self.containers = self._construct_neon()
-
-    def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
-
-    def add_to_app(self, app:typing.List, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        app.append(self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
-
-    @Operator.register_backend(ComputeBackend.NEON)
-    def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        #if self.c is None:
-        #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
-        # c = None
-        # if self.odd_or_even == 'even':
-        #     c = self.c_even
-        # else:
-        #     c = self.c_odd
-        #
-        # if c is None:
-        #     pass
-        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
-        c.run(0)
-        #
-        # if self.odd_or_even == 'even':
-        #     c = self.c_even
-        # else:
-        #     c = self.c_odd
-        #
-        # if self.odd_or_even == 'even':
-        #     self.odd_or_even = 'odd'
-
-        return f_0, f_1

From e6d62227342e42b7339ac50a75931ecce491745d Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 19 May 2025 10:56:40 +0200
Subject: [PATCH 037/208] Fusion

---
 xlb/helper/nse_multires_solver.py            | 26 ++++++++++----------
 xlb/operator/stepper/nse_multires_stepper.py |  9 ++++++-
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index 13bf71fe..c1269282 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -156,19 +156,19 @@ def recurtion(level, app):
             #     timestep=0,
             # )
 
-            print(f"RECURTION Level {level}, stream_coarse_step_C")
-
-            self.stepper.add_to_app(
-                app=app,
-                op_name="stream_coarse_step_C",
-                mres_level=level,
-                f_0=self.f_1,
-                f_1=self.f_0,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
-                omega=self.omega,
-                timestep=0,
-            )
+            # print(f"RECURTION Level {level}, stream_coarse_step_C")
+            #
+            # self.stepper.add_to_app(
+            #     app=app,
+            #     op_name="stream_coarse_step_C",
+            #     mres_level=level,
+            #     f_0=self.f_1,
+            #     f_1=self.f_0,
+            #     bc_mask=self.bc_mask,
+            #     missing_mask=self.missing_mask,
+            #     omega=self.omega,
+            #     timestep=0,
+            # )
             # if(level == 1):
             #     wp.synchronize()
             #     self.f_0.update_host(0)
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 36207d79..45db0e5e 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -521,7 +521,14 @@ def cl_stream_coarse(index: typing.Any):
 
 
 
-
+                    # do non mres post-streaming corrections
+                    _f_post_stream = apply_bc(
+                        index, timestep,
+                        _boundary_id,
+                        _missing_mask,
+                        f_0_pn, f_1_pn,
+                        _f_post_collision, _f_post_stream, True
+                    )
 
 
 

From 694ed06b759a5e62ea0b1db1f9544fd13209c21b Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sun, 18 May 2025 21:52:45 -0400
Subject: [PATCH 038/208] clean up and removing deprecated odd/even approach

---
 .../3_levels_mlups_3d_multires_solver.py      | 108 ++++++----
 examples/performance/mlups_3d.py              |   7 +-
 .../performance/mlups_3d_multires_solver.py   |  75 ++++---
 .../mlups_3d_multires_solver_single_level.py  |  65 +++---
 examples/performance/mlups_3d_neon.py         |  61 +++---
 examples/performance/mlups_3d_neon_sovler.py  |  38 ++--
 xlb/default_config.py                         |   9 +-
 xlb/grid/grid.py                              |  43 ++--
 xlb/grid/multires_grid.py                     |  64 +++---
 xlb/grid/neon_grid.py                         |  59 ++----
 xlb/helper/initializers.py                    |  16 +-
 xlb/helper/nse_multires_solver.py             |  22 +-
 xlb/helper/nse_solver.py                      |  71 +------
 .../bc_fullway_bounce_back.py                 |   4 +-
 .../bc_halfway_bounce_back.py                 |  27 +--
 .../indices_boundary_masker.py                |  28 +--
 .../mulltires_quadratic_equilibrium.py        |  20 +-
 .../equilibrium/quadratic_equilibrium.py      |  22 +-
 xlb/operator/macroscopic/first_moment.py      |   4 +-
 xlb/operator/macroscopic/macroscopic.py       |  53 +++--
 .../macroscopic/multires_macroscopic.py       |  38 +---
 xlb/operator/macroscopic/zero_moment.py       |   4 +-
 xlb/operator/operator.py                      |   4 +-
 xlb/operator/stepper/nse_multires_stepper.py  | 193 +++++++-----------
 .../stepper/nse_multires_stepper_vk.py        | 114 ++++++-----
 xlb/operator/stepper/nse_stepper.py           |  68 +++---
 xlb/operator/stream/stream.py                 |   8 +-
 27 files changed, 535 insertions(+), 690 deletions(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index e48a6f4e..394a9092 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -6,6 +6,7 @@
 
 # add a directory to the PYTHON PATH
 import sys
+
 # sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
 import neon
 
@@ -16,29 +17,48 @@
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
 from xlb.distribute import distribute
 
+
+import time
+import numpy as np
+import vtk
+from vtk.util.numpy_support import numpy_to_vtk, numpy_to_vtkIdTypeArray
+from pathlib import Path
+import open3d as o3d
+from tabulate import tabulate
+import h5py
+import cupy as cp  # Added for GPU-accelerated array operations
+
+# Import and initialize NVIDIA Warp for GPU acceleration
+import warp as wp
+
+wp.init()
+DEVICE = "cuda"
+
+
 def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
     # Positional arguments
     parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
     parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
+    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default='D3Q19',
-                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
-                        )
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
 
     return parser.parse_args()
 
 
 def setup_simulation(args):
-    backend = None
-    if args.backend == "jax": backend = ComputeBackend.JAX
-    elif args.backend == "warp": backend = ComputeBackend.WARP
-    elif args.backend == "neon": backend = ComputeBackend.NEON
-    if backend is None:
+    compute_backend = None
+    if args.compute_backend == "jax":
+        compute_backend = ComputeBackend.JAX
+    elif args.compute_backend == "warp":
+        compute_backend = ComputeBackend.WARP
+    elif args.compute_backend == "neon":
+        compute_backend = ComputeBackend.NEON
+    if compute_backend is None:
         raise ValueError("Invalid backend")
 
     precision_policy_map = {
@@ -52,29 +72,31 @@ def setup_simulation(args):
         raise ValueError("Invalid precision")
 
     velocity_set = None
-    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if args.velocity_set == "D3Q19":
+        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
+    elif args.velocity_set == "D3Q27":
+        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
     if velocity_set is None:
         raise ValueError("Invalid velocity set")
 
     xlb.init(
         velocity_set=velocity_set,
-        default_backend=backend,
+        default_backend=compute_backend,
         default_precision_policy=precision_policy,
     )
 
-    return backend, precision_policy
+    return compute_backend, precision_policy
 
 
-def run(backend, precision_policy, grid_shape, num_steps):
+def run(compute_backend, precision_policy, grid_shape, num_steps):
     # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
 
     def peel(dim, idx, peel_level, outwards):
         if outwards:
-            xIn =  idx.x <= peel_level or idx.x >= dim.x -1 -peel_level
-            yIn =  idx.y <= peel_level or idx.y >= dim.y -1 -peel_level
-            zIn =  idx.z <= peel_level or idx.z >= dim.z -1 - peel_level
+            xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
+            yIn = idx.y <= peel_level or idx.y >= dim.y - 1 - peel_level
+            zIn = idx.z <= peel_level or idx.z >= dim.z - 1 - peel_level
             return xIn or yIn or zIn
         else:
             xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
@@ -82,14 +104,11 @@ def peel(dim, idx, peel_level, outwards):
             zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
             return xIn and yIn and zIn
 
-
-    dim = neon.Index_3d(grid_shape[0],
-                        grid_shape[1],
-                        grid_shape[2])
+    dim = neon.Index_3d(grid_shape[0], grid_shape[1], grid_shape[2])
 
     def get_peeled_np(level, width):
         divider = 2**level
-        m = neon.Index_3d(dim.x // divider , dim.y // divider, dim.z // divider)
+        m = neon.Index_3d(dim.x // divider, dim.y // divider, dim.z // divider)
         if level == 0:
             m = dim
 
@@ -113,17 +132,19 @@ def get_peeled_np(level, width):
     l2 = get_peeled_np(2, 4)
 
     num_levels = 4
-    lastLevel = num_levels -1
+    lastLevel = num_levels - 1
     divider = 2**lastLevel
-    m = neon.Index_3d(dim.x // divider +1, dim.y // divider+1, dim.z // divider+1)
+    m = neon.Index_3d(dim.x // divider + 1, dim.y // divider + 1, dim.z // divider + 1)
     lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
     lastLevel = np.ascontiguousarray(lastLevel, dtype=np.int32)
 
     levels = [l0, l1, l2, lastLevel]
-
-    grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
-                                 sparsity_pattern_list=levels,
-                                 sparsity_pattern_origins=[ neon.Index_3d(0, 0, 0)]*len(levels),)
+    grid = multires_grid_factory(
+        grid_shape,
+        velocity_set=velocity_set,
+        sparsity_pattern_list=levels,
+        sparsity_pattern_origins=[neon.Index_3d(0, 0, 0)] * len(levels),
+    )
 
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -133,8 +154,10 @@ def get_peeled_np(level, width):
 
     prescribed_vel = 0.1
 
-    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
-                           EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls)]
+    boundary_conditions = [
+        EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
+        EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls),
+    ]
 
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
@@ -144,7 +167,7 @@ def get_peeled_np(level, width):
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
     omega = 1.0 / (3.0 * visc + 0.5)
-    #omega = 1.0
+    # omega = 1.0
 
     sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
 
@@ -156,23 +179,22 @@ def get_peeled_np(level, width):
     start_time = time.time()
     for i in range(num_steps):
         sim.step()
-        if i%100 == 0:
+        if i % 1000 == 0:
             print(f"step {i}")
-        #    sim.export_macroscopic("u_lid_driven_cavity_")
+            sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
     print(f"Timing  {t}")
 
-    sim.export_macroscopic("u_lid_driven_cavity_")
-    return {"time":t, "num_levels":num_levels}
+    # sim.export_macroscopic("u_lid_driven_cavity_")
+    return {"time": t, "num_levels": num_levels}
 
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
-    num_step_finer = num_steps * 2**(num_levels-1)
+    num_step_finer = num_steps * 2 ** (num_levels - 1)
     total_lattice_updates = cube_edge**3 * num_step_finer
     mlups = (total_lattice_updates / elapsed_time) / 1e6
-    return {"EMLUPS":mlups, "finer_steps":num_step_finer}
-
+    return {"EMLUPS": mlups, "finer_steps": num_step_finer}
 
     # # remove boundary cells
     # rho = rho[:, 1:-1, 1:-1, 1:-1]
@@ -186,13 +208,13 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
     # from xlb.utils import  save_image
     # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
-def main():
 
+def main():
     args = parse_arguments()
-    backend, precision_policy = setup_simulation(args)
+    compute_backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    stats = run(backend, precision_policy, grid_shape, args.num_steps)
-    mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats['time'], stats['num_levels'])
+    stats = run(compute_backend, precision_policy, grid_shape, args.num_steps)
+    mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats["time"], stats["num_levels"])
 
     print(f"Simulation completed in {stats['time']:.2f} seconds")
     print(f"Number of levels {stats['num_levels']}")
diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 409a8d59..eca222df 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -23,7 +23,12 @@ def parse_arguments():
 
 
 def setup_simulation(args):
-    compute_backend = ComputeBackend.JAX if args.compute_backend == "jax" else ComputeBackend.WARP
+    # Resolve the CLI string through a lookup table and fail fast on an unknown
+    # name; an if/elif chain with no else would leave `compute_backend` unbound.
+    backend_map = {"jax": ComputeBackend.JAX, "warp": ComputeBackend.WARP, "neon": ComputeBackend.NEON}
+    if args.compute_backend not in backend_map:
+        raise ValueError("Invalid backend")
+    compute_backend = backend_map[args.compute_backend]
     precision_policy_map = {
         "fp32/fp32": PrecisionPolicy.FP32FP32,
         "fp64/fp64": PrecisionPolicy.FP64FP64,
diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
index 1dee395a..3b555ea7 100644
--- a/examples/performance/mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires_solver.py
@@ -6,6 +6,7 @@
 
 # add a directory to the PYTHON PATH
 import sys
+
 # sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
 import neon
 
@@ -16,6 +17,7 @@
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
 from xlb.distribute import distribute
 
+
 def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
     # Positional arguments
@@ -26,18 +28,19 @@ def parse_arguments():
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default='D3Q19',
-                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
-                        )
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
 
     return parser.parse_args()
 
 
 def setup_simulation(args):
     backend = None
-    if args.backend == "jax": backend = ComputeBackend.JAX
-    elif args.backend == "warp": backend = ComputeBackend.WARP
-    elif args.backend == "neon": backend = ComputeBackend.NEON
+    if args.backend == "jax":
+        backend = ComputeBackend.JAX
+    elif args.backend == "warp":
+        backend = ComputeBackend.WARP
+    elif args.backend == "neon":
+        backend = ComputeBackend.NEON
     if backend is None:
         raise ValueError("Invalid backend")
 
@@ -52,8 +55,10 @@ def setup_simulation(args):
         raise ValueError("Invalid precision")
 
     velocity_set = None
-    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if args.velocity_set == "D3Q19":
+        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    elif args.velocity_set == "D3Q27":
+        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
     if velocity_set is None:
         raise ValueError("Invalid velocity set")
 
@@ -72,9 +77,9 @@ def run(backend, precision_policy, grid_shape, num_steps):
 
     def peel(dim, idx, peel_level, outwards):
         if outwards:
-            xIn =  idx.x <= peel_level or idx.x >= dim.x -1 -peel_level
-            yIn =  idx.y <= peel_level or idx.y >= dim.y -1 -peel_level
-            zIn =  idx.z <= peel_level or idx.z >= dim.z -1 - peel_level
+            xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
+            yIn = idx.y <= peel_level or idx.y >= dim.y - 1 - peel_level
+            zIn = idx.z <= peel_level or idx.z >= dim.z - 1 - peel_level
             return xIn or yIn or zIn
         else:
             xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
@@ -82,38 +87,42 @@ def peel(dim, idx, peel_level, outwards):
             zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
             return xIn and yIn and zIn
 
-
-    dim = neon.Index_3d(grid_shape[0],
-                        grid_shape[1],
-                        grid_shape[2])
+    dim = neon.Index_3d(grid_shape[0], grid_shape[1], grid_shape[2])
     level_zero_mask = np.zeros((dim.x, dim.y, dim.z), dtype=int)
     level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
     # loop over all the elements in level_zero_mask and set to one any that have x=0 or y=0 or z=0
     for i in range(dim.x):
         for j in range(dim.y):
             for k in range(dim.z):
-                idx = neon.Index_3d(i,j,k)
+                idx = neon.Index_3d(i, j, k)
                 val = 0
-                if peel(dim, idx, dim.x/9, True):
+                if peel(dim, idx, dim.x / 9, True):
                     val = 1
                 level_zero_mask[i, j, k] = val
 
-
     m = neon.Index_3d(dim.x // 2, dim.y // 2, dim.z // 2)
     level_one_mask = np.ones((m.x, m.y, m.z), dtype=int)
     for i in range(m.x):
         for j in range(m.x):
             for k in range(m.x):
-                idx = neon.Index_3d(i,j,k)
+                idx = neon.Index_3d(i, j, k)
                 val = 1
                 level_one_mask[i, j, k] = val
 
     level_one_mask = np.ascontiguousarray(level_one_mask, dtype=np.int32)
 
-    grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
-                                 sparsity_pattern_list=[ level_zero_mask,level_one_mask ,],
-                                 sparsity_pattern_origins=[ neon.Index_3d(0, 0, 0),
-                                                            neon.Index_3d(0, 0, 0),])
+    grid = multires_grid_factory(
+        grid_shape,
+        velocity_set=velocity_set,
+        sparsity_pattern_list=[
+            level_zero_mask,
+            level_one_mask,
+        ],
+        sparsity_pattern_origins=[
+            neon.Index_3d(0, 0, 0),
+            neon.Index_3d(0, 0, 0),
+        ],
+    )
 
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -123,8 +132,10 @@ def peel(dim, idx, peel_level, outwards):
 
     prescribed_vel = 0.05
 
-    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
-                           EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls)]
+    boundary_conditions = [
+        EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
+        EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls),
+    ]
 
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
@@ -136,7 +147,6 @@ def peel(dim, idx, peel_level, outwards):
     omega = 1.0 / (3.0 * visc + 0.5)
     omega = 1.0
 
-
     # # Initialize fields and run simulation
     # omega = 1.0
 
@@ -150,7 +160,7 @@ def peel(dim, idx, peel_level, outwards):
     for i in range(num_steps):
         print(f"step {i}")
         sim.step()
-        if i%10 == 0:
+        if i % 10 == 0:
             sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
@@ -164,7 +174,8 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
-def post_process(macro, rho, u, f_0,  i):
+
+def post_process(macro, rho, u, f_0, i):
     # Write the results. We'll use JAX backend for the post-processing
     # import jax.numpy as jnp
     # if not isinstance(f_0, jnp.ndarray):
@@ -172,13 +183,13 @@ def post_process(macro, rho, u, f_0,  i):
     #     f_0 = wp.to_jax(f_0)[..., 0]
     # else:
     #     f_0 = f_0
-    rho, u = macro(f_0, rho, u )
+    rho, u = macro(f_0, rho, u)
     wp.synchronize()
     u.update_host(0)
     rho.update_host(0)
     wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
 
     pass
 
@@ -194,8 +205,8 @@ def post_process(macro, rho, u, f_0,  i):
     # from xlb.utils import  save_image
     # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
-def main():
 
+def main():
     args = parse_arguments()
     backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
diff --git a/examples/performance/mlups_3d_multires_solver_single_level.py b/examples/performance/mlups_3d_multires_solver_single_level.py
index 3d7aaee8..ebb3c7df 100644
--- a/examples/performance/mlups_3d_multires_solver_single_level.py
+++ b/examples/performance/mlups_3d_multires_solver_single_level.py
@@ -6,6 +6,7 @@
 
 # add a directory to the PYTHON PATH
 import sys
+
 # sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
 import neon
 
@@ -16,6 +17,7 @@
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
 from xlb.distribute import distribute
 
+
 def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
     # Positional arguments
@@ -26,18 +28,19 @@ def parse_arguments():
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default='D3Q19',
-                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
-                        )
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
 
     return parser.parse_args()
 
 
 def setup_simulation(args):
     backend = None
-    if args.backend == "jax": backend = ComputeBackend.JAX
-    elif args.backend == "warp": backend = ComputeBackend.WARP
-    elif args.backend == "neon": backend = ComputeBackend.NEON
+    if args.backend == "jax":
+        backend = ComputeBackend.JAX
+    elif args.backend == "warp":
+        backend = ComputeBackend.WARP
+    elif args.backend == "neon":
+        backend = ComputeBackend.NEON
     if backend is None:
         raise ValueError("Invalid backend")
 
@@ -52,8 +55,10 @@ def setup_simulation(args):
         raise ValueError("Invalid precision")
 
     velocity_set = None
-    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if args.velocity_set == "D3Q19":
+        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    elif args.velocity_set == "D3Q27":
+        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
     if velocity_set is None:
         raise ValueError("Invalid velocity set")
 
@@ -72,9 +77,9 @@ def run(backend, precision_policy, grid_shape, num_steps):
 
     def peel(dim, idx, peel_level, outwards):
         if outwards:
-            xIn =  idx.x <= peel_level or idx.x >= dim.x -1 -peel_level
-            yIn =  idx.y <= peel_level or idx.y >= dim.y -1 -peel_level
-            zIn =  idx.z <= peel_level or idx.z >= dim.z -1 - peel_level
+            xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
+            yIn = idx.y <= peel_level or idx.y >= dim.y - 1 - peel_level
+            zIn = idx.z <= peel_level or idx.z >= dim.z - 1 - peel_level
             return xIn or yIn or zIn
         else:
             xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
@@ -82,17 +87,20 @@ def peel(dim, idx, peel_level, outwards):
             zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
             return xIn and yIn and zIn
 
-
-    dim = neon.Index_3d(grid_shape[0],
-                        grid_shape[1],
-                        grid_shape[2])
+    dim = neon.Index_3d(grid_shape[0], grid_shape[1], grid_shape[2])
     level_zero_mask = np.ones((dim.x, dim.y, dim.z), dtype=int)
     level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
 
-
-    grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
-                                 sparsity_pattern_list=[ level_zero_mask,],
-                                 sparsity_pattern_origins=[ neon.Index_3d(0, 0, 0),])
+    grid = multires_grid_factory(
+        grid_shape,
+        velocity_set=velocity_set,
+        sparsity_pattern_list=[
+            level_zero_mask,
+        ],
+        sparsity_pattern_origins=[
+            neon.Index_3d(0, 0, 0),
+        ],
+    )
 
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -102,8 +110,10 @@ def peel(dim, idx, peel_level, outwards):
 
     prescribed_vel = 0.05
 
-    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
-                           EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls)]
+    boundary_conditions = [
+        EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
+        EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls),
+    ]
 
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
@@ -126,7 +136,7 @@ def peel(dim, idx, peel_level, outwards):
     for i in range(num_steps):
         print(f"step {i}")
         sim.step()
-        if i%1 == 0:
+        if i % 1 == 0:
             sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
@@ -140,7 +150,8 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
-def post_process(macro, rho, u, f_0,  i):
+
+def post_process(macro, rho, u, f_0, i):
     # Write the results. We'll use JAX backend for the post-processing
     # import jax.numpy as jnp
     # if not isinstance(f_0, jnp.ndarray):
@@ -148,13 +159,13 @@ def post_process(macro, rho, u, f_0,  i):
     #     f_0 = wp.to_jax(f_0)[..., 0]
     # else:
     #     f_0 = f_0
-    rho, u = macro(f_0, rho, u )
+    rho, u = macro(f_0, rho, u)
     wp.synchronize()
     u.update_host(0)
     rho.update_host(0)
     wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
 
     pass
 
@@ -170,8 +181,8 @@ def post_process(macro, rho, u, f_0,  i):
     # from xlb.utils import  save_image
     # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
-def main():
 
+def main():
     args = parse_arguments()
     backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
diff --git a/examples/performance/mlups_3d_neon.py b/examples/performance/mlups_3d_neon.py
index 30e20df3..4496287e 100644
--- a/examples/performance/mlups_3d_neon.py
+++ b/examples/performance/mlups_3d_neon.py
@@ -8,7 +8,8 @@
 
 # add a directory to the PYTHON PATH
 import sys
-sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
+
+sys.path.append("/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/")
 import neon
 
 from xlb.compute_backend import ComputeBackend
@@ -24,24 +25,25 @@ def parse_arguments():
     # Positional arguments
     parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
     parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
+    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default='D3Q19',
-                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
-                        )
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
 
     return parser.parse_args()
 
 
 def setup_simulation(args):
-    backend = None
-    if args.backend == "jax": backend = ComputeBackend.JAX
-    elif args.backend == "warp": backend = ComputeBackend.WARP
-    elif args.backend == "neon": backend = ComputeBackend.NEON
-    if backend is None:
+    compute_backend = None
+    if args.compute_backend == "jax":
+        compute_backend = ComputeBackend.JAX
+    elif args.compute_backend == "warp":
+        compute_backend = ComputeBackend.WARP
+    elif args.compute_backend == "neon":
+        compute_backend = ComputeBackend.NEON
+    if compute_backend is None:
         raise ValueError("Invalid backend")
 
     precision_policy_map = {
@@ -55,23 +57,25 @@ def setup_simulation(args):
         raise ValueError("Invalid precision")
 
     velocity_set = None
-    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if args.velocity_set == "D3Q19":
+        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
+    elif args.velocity_set == "D3Q27":
+        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
     if velocity_set is None:
         raise ValueError("Invalid velocity set")
 
     xlb.init(
         velocity_set=velocity_set,
-        default_backend=backend,
+        default_backend=compute_backend,
         default_precision_policy=precision_policy,
     )
 
-    return backend, precision_policy
+    return compute_backend, precision_policy
 
 
-def run(macro, backend, precision_policy, grid_shape, num_steps):
+def run(macro, compute_backend, precision_policy, grid_shape, num_steps):
     # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
     grid = grid_factory(grid_shape, velocity_set=velocity_set)
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -87,8 +91,8 @@ def run(macro, backend, precision_policy, grid_shape, num_steps):
     # Initialize fields and run simulation
     omega = 1.0
     f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
-    rho  = stepper.grid.create_field(1, dtype=precision_policy.store_precision)
-    u  = stepper.grid.create_field(3, dtype=precision_policy.store_precision)
+    rho = stepper.grid.create_field(1, dtype=precision_policy.store_precision)
+    u = stepper.grid.create_field(3, dtype=precision_policy.store_precision)
 
     start_time = time.time()
 
@@ -96,7 +100,7 @@ def run(macro, backend, precision_policy, grid_shape, num_steps):
         f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
         f_0, f_1 = f_1, f_0
 
-    #if i % 2 == 0 or i == num_steps - 1:
+        # if i % 2 == 0 or i == num_steps - 1:
         wp.synchronize()
         post_process(macro, rho, u, f_0, i)
     wp.synchronize()
@@ -109,7 +113,8 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
-def post_process(macro, rho, u, f_0,  i):
+
+def post_process(macro, rho, u, f_0, i):
     # Write the results. We'll use JAX backend for the post-processing
     # import jax.numpy as jnp
     # if not isinstance(f_0, jnp.ndarray):
@@ -117,13 +122,13 @@ def post_process(macro, rho, u, f_0,  i):
     #     f_0 = wp.to_jax(f_0)[..., 0]
     # else:
     #     f_0 = f_0
-    rho, u = macro(f_0, rho, u )
+    rho, u = macro(f_0, rho, u)
     wp.synchronize()
     u.update_host(0)
     rho.update_host(0)
     wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
 
     pass
 
@@ -139,19 +144,19 @@ def post_process(macro, rho, u, f_0,  i):
     # from xlb.utils import  save_image
     # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
-def main():
-
 
+def main():
     args = parse_arguments()
-    backend, precision_policy = setup_simulation(args)
+    compute_backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
     from xlb.operator.macroscopic import Macroscopic
+
     macro = Macroscopic(
         compute_backend=ComputeBackend.NEON,
         precision_policy=precision_policy,
-        velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=ComputeBackend.NEON),
+        velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=ComputeBackend.NEON),
     )
-    elapsed_time = run(macro, backend, precision_policy, grid_shape, args.num_steps)
+    elapsed_time = run(macro, compute_backend, precision_policy, grid_shape, args.num_steps)
     mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
 
     print(f"Simulation completed in {elapsed_time:.2f} seconds")
diff --git a/examples/performance/mlups_3d_neon_sovler.py b/examples/performance/mlups_3d_neon_sovler.py
index a5c65af2..ff72e366 100644
--- a/examples/performance/mlups_3d_neon_sovler.py
+++ b/examples/performance/mlups_3d_neon_sovler.py
@@ -8,7 +8,8 @@
 
 # add a directory to the PYTHON PATH
 import sys
-sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
+
+sys.path.append("/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/")
 import neon
 
 from xlb.compute_backend import ComputeBackend
@@ -18,6 +19,7 @@
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
 from xlb.distribute import distribute
 
+
 def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
     # Positional arguments
@@ -28,18 +30,19 @@ def parse_arguments():
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default='D3Q19',
-                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)"
-                        )
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
 
     return parser.parse_args()
 
 
 def setup_simulation(args):
     backend = None
-    if args.backend == "jax": backend = ComputeBackend.JAX
-    elif args.backend == "warp": backend = ComputeBackend.WARP
-    elif args.backend == "neon": backend = ComputeBackend.NEON
+    if args.backend == "jax":
+        backend = ComputeBackend.JAX
+    elif args.backend == "warp":
+        backend = ComputeBackend.WARP
+    elif args.backend == "neon":
+        backend = ComputeBackend.NEON
     if backend is None:
         raise ValueError("Invalid backend")
 
@@ -54,8 +57,10 @@ def setup_simulation(args):
         raise ValueError("Invalid precision")
 
     velocity_set = None
-    if args.velocity_set == 'D3Q19': velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == 'D3Q27': velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
+    if args.velocity_set == "D3Q19":
+        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
+    elif args.velocity_set == "D3Q27":
+        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
     if velocity_set is None:
         raise ValueError("Invalid velocity set")
 
@@ -68,7 +73,7 @@ def setup_simulation(args):
     return backend, precision_policy
 
 
-def run( backend, precision_policy, grid_shape, num_steps):
+def run(backend, precision_policy, grid_shape, num_steps):
     # Create grid and setup boundary conditions
     velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
     grid = grid_factory(grid_shape, velocity_set=velocity_set)
@@ -99,7 +104,7 @@ def run( backend, precision_policy, grid_shape, num_steps):
 
     for i in range(num_steps):
         sim.step()
-        if i%500 == 0:
+        if i % 500 == 0:
             sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
@@ -113,7 +118,8 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
-def post_process(macro, rho, u, f_0,  i):
+
+def post_process(macro, rho, u, f_0, i):
     # Write the results. We'll use JAX backend for the post-processing
     # import jax.numpy as jnp
     # if not isinstance(f_0, jnp.ndarray):
@@ -121,13 +127,13 @@ def post_process(macro, rho, u, f_0,  i):
     #     f_0 = wp.to_jax(f_0)[..., 0]
     # else:
     #     f_0 = f_0
-    rho, u = macro(f_0, rho, u )
+    rho, u = macro(f_0, rho, u)
     wp.synchronize()
     u.update_host(0)
     rho.update_host(0)
     wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", 'u')
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", 'rho')
+    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
+    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
 
     pass
 
@@ -143,8 +149,8 @@ def post_process(macro, rho, u, f_0,  i):
     # from xlb.utils import  save_image
     # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
-def main():
 
+def main():
     args = parse_arguments()
     backend, precision_policy = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
diff --git a/xlb/default_config.py b/xlb/default_config.py
index ff238297..fc24eb4d 100644
--- a/xlb/default_config.py
+++ b/xlb/default_config.py
@@ -24,14 +24,13 @@ def init(velocity_set, default_backend, default_precision_policy):
         import warp as wp
         import neon
 
-        #wp.config.mode = "release"
-        #wp.config.llvm_cuda = False
-        #wp.config.verbose = True
-        #wp.verbose_warnings = True
+        # wp.config.mode = "release"
+        # wp.config.llvm_cuda = False
+        # wp.config.verbose = True
+        # wp.verbose_warnings = True
 
         wp.init()
 
-
         # It's a good idea to always clear the kernel cache when developing new native or codegen features
         wp.build.clear_kernel_cache()
 
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index 4ab6cfee..d4696dd4 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -6,10 +6,12 @@
 from xlb.compute_backend import ComputeBackend
 import neon
 
-def grid_factory(shape: Tuple[int, ...],
-                 compute_backend: ComputeBackend = None,
-                 velocity_set=None,
-                 ):
+
+def grid_factory(
+    shape: Tuple[int, ...],
+    compute_backend: ComputeBackend = None,
+    velocity_set=None,
+):
     compute_backend = compute_backend or DefaultConfig.default_backend
     if compute_backend == ComputeBackend.WARP:
         from xlb.grid.warp_grid import WarpGrid
@@ -18,8 +20,7 @@ def grid_factory(shape: Tuple[int, ...],
     elif compute_backend == ComputeBackend.NEON:
         from xlb.grid.neon_grid import NeonGrid
 
-        return NeonGrid(shape=shape,
-                        velocity_set=velocity_set)
+        return NeonGrid(shape=shape, velocity_set=velocity_set)
     elif compute_backend == ComputeBackend.JAX:
         from xlb.grid.jax_grid import JaxGrid
 
@@ -27,25 +28,26 @@ def grid_factory(shape: Tuple[int, ...],
 
     raise ValueError(f"Compute backend {compute_backend} is not supported")
 
-def multires_grid_factory(shape: Tuple[int, ...],
-                 compute_backend: ComputeBackend = None,
-                 velocity_set=None,
-                 sparsity_pattern_list: List[np.ndarray] = [],
-                 sparsity_pattern_origins: List[neon.Index_3d]=[],
-                 ):
 
+def multires_grid_factory(
+    shape: Tuple[int, ...],
+    compute_backend: ComputeBackend = None,
+    velocity_set=None,
+    sparsity_pattern_list: List[np.ndarray] = [],
+    sparsity_pattern_origins: List[neon.Index_3d] = [],
+):
     compute_backend = compute_backend or DefaultConfig.default_backend
     if compute_backend == ComputeBackend.WARP:
         from xlb.grid.warp_grid import WarpGrid
+
         raise ValueError(f"Compute backend {compute_backend} is not supported for multires grid")
 
     if compute_backend == ComputeBackend.NEON:
         from xlb.grid.multires_grid import NeonMultiresGrid
 
-        return NeonMultiresGrid(shape=shape,
-                                velocity_set=velocity_set,
-                                sparsity_pattern_list = sparsity_pattern_list,
-                                sparsity_pattern_origins=  sparsity_pattern_origins)
+        return NeonMultiresGrid(
+            shape=shape, velocity_set=velocity_set, sparsity_pattern_list=sparsity_pattern_list, sparsity_pattern_origins=sparsity_pattern_origins
+        )
 
     elif compute_backend == ComputeBackend.JAX:
         raise ValueError(f"Compute backend {compute_backend} is not supported for multires grid")
@@ -54,10 +56,11 @@ def multires_grid_factory(shape: Tuple[int, ...],
 
 
 class Grid(ABC):
-    def __init__(self,
-                 shape: Tuple[int, ...],
-                 compute_backend: ComputeBackend,
-                 ):
+    def __init__(
+        self,
+        shape: Tuple[int, ...],
+        compute_backend: ComputeBackend,
+    ):
         self.shape = shape
         self.dim = len(shape)
         self.compute_backend = compute_backend
diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index 25c408de..d434fdc9 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -9,10 +9,13 @@
 
 
 class NeonMultiresGrid(Grid):
-    def __init__(self, shape,
-                 velocity_set,
-                 sparsity_pattern_list: List[np.ndarray],
-                 sparsity_pattern_origins: List[neon.Index_3d],):
+    def __init__(
+        self,
+        shape,
+        velocity_set,
+        sparsity_pattern_list: List[np.ndarray],
+        sparsity_pattern_origins: List[neon.Index_3d],
+    ):
         from .warp_grid import WarpGrid
 
         self.bk = None
@@ -30,16 +33,14 @@ def _get_velocity_set(self):
         return self.xlb_lattice
 
     def _initialize_backend(self):
-
         # FIXME@max: for now we hardcode the number of devices to 0
         num_devs = 1
         dev_idx_list = list(range(num_devs))
 
         if len(self.shape) == 2:
             import py_neon
-            self.dim = py_neon.Index_3d(self.shape[0],
-                                        1,
-                                        self.shape[1])
+
+            self.dim = py_neon.Index_3d(self.shape[0], 1, self.shape[1])
             self.neon_stencil = []
             for c_idx in range(len(self.xlb_lattice._c[0])):
                 xval = self.xlb_lattice._c[0][c_idx]
@@ -47,9 +48,7 @@ def _initialize_backend(self):
                 self.neon_stencil.append([xval, 0, yval])
 
         else:
-            self.dim = neon.Index_3d(self.shape[0],
-                                        self.shape[1],
-                                        self.shape[2])
+            self.dim = neon.Index_3d(self.shape[0], self.shape[1], self.shape[2])
 
             self.neon_stencil = []
             for c_idx in range(len(self.xlb_lattice._c[0])):
@@ -58,9 +57,7 @@ def _initialize_backend(self):
                 zval = self.xlb_lattice._c[2][c_idx]
                 self.neon_stencil.append([xval, yval, zval])
 
-        self.bk = neon.Backend(
-            runtime=neon.Backend.Runtime.stream,
-            dev_idx_list=dev_idx_list)
+        self.bk = neon.Backend(runtime=neon.Backend.Runtime.stream, dev_idx_list=dev_idx_list)
 
         """
          backend: neon.Backend,
@@ -73,33 +70,33 @@ def _initialize_backend(self):
             dim=self.dim,
             sparsity_pattern_list=self.sparsity_pattern_list,
             sparsity_pattern_origins=self.sparsity_pattern_origins,
-            stencil=self.neon_stencil)
+            stencil=self.neon_stencil,
+        )
         pass
 
     def create_field(
-            self,
-            cardinality: int,
-            dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
-            fill_value=None,
+        self,
+        cardinality: int,
+        dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+        fill_value=None,
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
-        field = self.grid.new_field(cardinality=cardinality,
-                                    dtype=dtype, )
+        field = self.grid.new_field(
+            cardinality=cardinality,
+            dtype=dtype,
+        )
         for l in range(self.count_levels):
             if fill_value is None:
-                field.zero_run(l, stream_idx = 0)
+                field.zero_run(l, stream_idx=0)
             else:
-                field.fill_run(level= l, value=fill_value,stream_idx = 0)
+                field.fill_run(level=l, value=fill_value, stream_idx=0)
         return field
 
     def get_neon_backend(self):
         return self.bk
 
-    def _create_warp_field(self,
-                           cardinality: int,
-                           dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
-                           fill_value=None,
-                           ne_field=None
+    def _create_warp_field(
+        self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None, fill_value=None, ne_field=None
     ):
         print("WARNING: allocating warp fields for mres is temporary and only a work around!")
         warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
@@ -109,12 +106,9 @@ def _create_warp_field(self,
         _d = self.xlb_lattice.d
 
         import typing
+
         @neon.Container.factory(mame="cloning-warp")
-        def container(
-                src_field: typing.Any,
-                dst_field: typing.Any,
-                cardinality: wp.int32
-        ):
+        def container(src_field: typing.Any, dst_field: typing.Any, cardinality: wp.int32):
             def loading_step(loader: neon.Loader):
                 loader.declare_execution_scope(self.grid, level=0)
                 src_pn = loader.get_read_handel(src_field)
@@ -131,9 +125,7 @@ def cloning(gridIdx: typing.Any):
                         gy, gz = gz, gy
 
                     for card in range(cardinality):
-                        value = wp.neon_read(src_pn,
-                                      gridIdx,
-                                      card)
+                        value = wp.neon_read(src_pn, gridIdx, card)
                         dst_field[card, gx, gy, gz] = value
 
                 loader.declare_kernel(cloning)
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index d8ce71c9..126ae627 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -1,6 +1,4 @@
 import warp as wp
-from cryptography.hazmat.backends.openssl.backend import backend
-
 import neon
 from .grid import Grid
 from xlb.precision_policy import Precision
@@ -8,6 +6,7 @@
 from typing import Literal
 from xlb import DefaultConfig
 
+
 class NeonGrid(Grid):
     def __init__(self, shape, velocity_set):
         from .warp_grid import WarpGrid
@@ -24,16 +23,14 @@ def _get_velocity_set(self):
         return self.xlb_lattice
 
     def _initialize_backend(self):
-
         # FIXME@max: for now we hardcode the number of devices to 0
         num_devs = 1
         dev_idx_list = list(range(num_devs))
 
         if len(self.shape) == 2:
             import py_neon
-            self.dim = py_neon.Index_3d(self.shape[0],
-                                        1,
-                                        self.shape[1])
+
+            self.dim = py_neon.Index_3d(self.shape[0], 1, self.shape[1])
             self.neon_stencil = []
             for c_idx in range(len(self.xlb_lattice._c[0])):
                 xval = self.xlb_lattice._c[0][c_idx]
@@ -41,9 +38,7 @@ def _initialize_backend(self):
                 self.neon_stencil.append([xval, 0, yval])
 
         else:
-            self.dim = neon.Index_3d(self.shape[0],
-                                        self.shape[1],
-                                        self.shape[2])
+            self.dim = neon.Index_3d(self.shape[0], self.shape[1], self.shape[2])
 
             self.neon_stencil = []
             for c_idx in range(len(self.xlb_lattice._c[0])):
@@ -52,38 +47,31 @@ def _initialize_backend(self):
                 zval = self.xlb_lattice._c[2][c_idx]
                 self.neon_stencil.append([xval, yval, zval])
 
-        self.bk = neon.Backend(
-            runtime=neon.Backend.Runtime.stream,
-            dev_idx_list=dev_idx_list)
+        self.bk = neon.Backend(runtime=neon.Backend.Runtime.stream, dev_idx_list=dev_idx_list)
 
-        self.grid = neon.dense.dGrid(
-            backend=self.bk,
-            dim=self.dim,
-            sparsity=None,
-            stencil=self.neon_stencil)
+        self.grid = neon.dense.dGrid(backend=self.bk, dim=self.dim, sparsity=None, stencil=self.neon_stencil)
         pass
 
     def create_field(
-            self,
-            cardinality: int,
-            dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
-            fill_value=None,
+        self,
+        cardinality: int,
+        dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+        fill_value=None,
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
-        field = self.grid.new_field(cardinality=cardinality,
-                                    dtype=dtype, )
+        field = self.grid.new_field(
+            cardinality=cardinality,
+            dtype=dtype,
+        )
 
         if fill_value is None:
-            field.zero_run(stream_idx = 0)
+            field.zero_run(stream_idx=0)
         else:
-            field.fill_run(value=fill_value,stream_idx = 0)
+            field.fill_run(value=fill_value, stream_idx=0)
         return field
 
-    def _create_warp_field(self,
-                           cardinality: int,
-                           dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
-                           fill_value=None,
-                           ne_field=None
+    def _create_warp_field(
+        self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None, fill_value=None, ne_field=None
     ):
         warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
         if ne_field is None:
@@ -92,12 +80,9 @@ def _create_warp_field(self,
         _d = self.xlb_lattice.d
 
         import typing
+
         @neon.Container.factory
-        def container(
-                src_field: typing.Any,
-                dst_field: typing.Any,
-                cardinality: wp.int32
-        ):
+        def container(src_field: typing.Any, dst_field: typing.Any, cardinality: wp.int32):
             def loading_step(loader: neon.Loader):
                 loader.declare_execution_scope(self.grid)
                 src_pn = loader.get_read_handel(src_field)
@@ -114,9 +99,7 @@ def cloning(gridIdx: typing.Any):
                         gy, gz = gz, gy
 
                     for card in range(cardinality):
-                        value = wp.neon_read(src_pn,
-                                      gridIdx,
-                                      card)
+                        value = wp.neon_read(src_pn, gridIdx, card)
                         dst_field[card, gx, gy, gz] = value
 
                 loader.declare_kernel(cloning)
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 510648dd..d94cfa0f 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -2,6 +2,7 @@
 from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator.equilibrium import MultiresQuadraticEquilibrium
 
+
 def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=None, u=None):
     if rho is None:
         rho = grid.create_field(cardinality=1, fill_value=1.0, dtype=precision_policy.compute_precision)
@@ -11,15 +12,12 @@ def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=
 
     if compute_backend == ComputeBackend.JAX:
         f = equilibrium(rho, u)
-
     elif compute_backend == ComputeBackend.WARP:
         f = equilibrium(rho, u, f)
-
-    elif backend == ComputeBackend.NEON:
+    elif compute_backend == ComputeBackend.NEON:
         f = equilibrium(rho, u, f)
-        pass
     else:
-        raise NotImplementedError(f"Backend {backend} not implemented")
+        raise NotImplementedError(f"Backend {compute_backend} not implemented")
 
     del rho, u
 
@@ -30,9 +28,5 @@ def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho
     equilibrium = MultiresQuadraticEquilibrium()
     for level in range(grid.count_levels):
         print("MultiresQuadraticEquilibrium")
-        equilibrium(level = level,
-                    rho= rho,
-                    u=u,
-                    f=f,
-                    stream= 0)
-    return f
\ No newline at end of file
+        equilibrium(level=level, rho=rho, u=u, f=f, stream=0)
+    return f
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/nse_multires_solver.py
index e2e40245..662aa94d 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/nse_multires_solver.py
@@ -19,10 +19,9 @@ def __init__(self, grid, velocity_set, stepper, omega):
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
-        self.coalescence_factor = grid.create_field(cardinality=velocity_set.q,
-                                                    dtype=self.precision_policy.store_precision)
+        self.coalescence_factor = grid.create_field(cardinality=velocity_set.q, dtype=self.precision_policy.store_precision)
 
-        fname_prefix = 'test'
+        fname_prefix = "test"
 
         for level in range(self.count_levels):
             self.u.fill_run(level, 0.0, 0)
@@ -42,8 +41,6 @@ def __init__(self, grid, velocity_set, stepper, omega):
         # wp.synchronize()
         # self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
 
-        self.odd_step = None
-        self.even_step = None
         self.iteration_idx = -1
         from xlb.operator.macroscopic import MultiresMacroscopic
 
@@ -61,18 +58,6 @@ def __init_containers(self, num_levels):
         self.containers = {}
         self.macroscopics = {}
 
-        # for target_level in range(num_levels):
-        #     self.containers[f"{target_level}"] = self.stepper.get_containers(target_level,
-        #                                              self.f_0,
-        #                                              self.f_1,
-        #                                              self.bc_mask,
-        #                                              self.missing_mask,
-        #                                              self.omega,
-        #                                              self.iteration_idx)
-        #     pass
-
-        # for target_level in range(num_levels):
-        #     self.macroscopics[f"{target_level}"] = self.macro.get_containers(target_level, self.f_0, self.f_1, self.bc_mask, self.rho, self.u)
         self.stepper.init_containers()
         self.macro.init_containers()
 
@@ -81,10 +66,11 @@ def export_macroscopic(self, fname_prefix):
         self.macro.launch_container(streamId=0, f_0=self.f_0, bc_mask=self.bc_mask, rho=self.rho, u=self.u)
 
         import warp as wp
+
         wp.synchronize()
         self.u.update_host(0)
         wp.synchronize()
-        self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
+        self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", "u")
         print("DONE exporting macroscopic")
 
         return
diff --git a/xlb/helper/nse_solver.py b/xlb/helper/nse_solver.py
index b22a9491..7e143586 100644
--- a/xlb/helper/nse_solver.py
+++ b/xlb/helper/nse_solver.py
@@ -4,6 +4,7 @@
 from typing import Tuple
 import neon
 
+
 def create_nse_fields(
     grid_shape: Tuple[int, int, int] = None,
     grid=None,
@@ -35,75 +36,7 @@ def create_nse_fields(
     # Create fields
     f_0 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
     f_1 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
-    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.UINT8)
+    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
     bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
     return grid, f_0, f_1, missing_mask, bc_mask
-
-class Nse_simulation:
-    def __init__(self, grid, velocity_set, stepper, omega):
-        self.stepper = stepper
-        self.grid = stepper.get_grid()
-        self.precision_policy = stepper.get_precision_policy()
-        self.velocity_set = velocity_set
-        self.omega = omega
-
-        # Create fields
-        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields()
-        # self.f_0 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-        # self.f_1 = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-        # self.missing_mask = grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
-        # self.bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
-
-        self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
-        self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
-
-        self.odd_step = None
-        self.even_step = None
-        self.iteration_idx = -1
-        from xlb.operator.macroscopic import Macroscopic
-
-        self.macro = Macroscopic(
-            compute_backend=self.grid.compute_backend,
-            precision_policy=self.precision_policy,
-            velocity_set=self.velocity_set,
-        )
-
-        self.__init_containers()
-
-    def __init_containers(self):
-        containers = self.stepper.get_containers(self.f_0, self.f_1, self.bc_mask, self.missing_mask, self.omega, self.iteration_idx)
-        self.even_step = containers['even']
-        self.odd_step = containers['odd']
-
-        containers = self.macro.get_containers(self.f_0, self.f_1,self.rho, self.u)
-
-        self.even_macroscopic = containers['even']
-        self.odd_macroscopic = containers['odd']
-
-        self.skeleton_even = neon.Skeleton(self.grid.get_neon_backend())
-        self.skeleton_odd = neon.Skeleton(self.grid.get_neon_backend())
-
-        self.skeleton_even.sequence(name="even lbm", containers=[self.even_step])
-        self.skeleton_odd.sequence(name="odd lbm", containers=[self.odd_step])
-
-    def export_macroscopic(self, fname_prefix):
-        if self.iteration_idx % 2 == 0:
-            self.even_macroscopic.run(0)
-        else:
-            self.odd_macroscopic.run(0)
-
-        import warp as wp
-        wp.synchronize()
-        self.u.update_host(0)
-        wp.synchronize()
-        self.u.export_vti(f"{fname_prefix}{self.iteration_idx}.vti", 'u')
-
-        return
-
-    def step(self):
-        self.iteration_idx += 1
-        if self.iteration_idx % 2 == 0:
-            self.even_step.run(0)
-        else:
-            self.odd_step.run(0)
diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index 3e1be12b..304f8264 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -86,5 +86,5 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         return f_post
 
     def _construct_neon(self):
-        functional,  _  = self._construct_warp()
-        return functional, None
\ No newline at end of file
+        functional, _ = self._construct_warp()
+        return functional, None
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index aaa565aa..16c40f2b 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -99,33 +99,10 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         return f_post
 
     def _construct_neon(self):
-        # Set local constants
-        _opp_indices = self.velocity_set.opp_indices
-
-        # Construct the functional for this BC
-        @wp.func
-        def functional(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-            for l in range(self.velocity_set.q):
-                # If the mask is missing then take the opposite index
-                if missing_mask[l] == wp.uint8(1):
-                    # Get the pre-streaming distribution function in oppisite direction
-                    _f[l] = f_pre[_opp_indices[l]]
-
-            return _f
-
+        functional, _ = self._construct_warp()
         return functional, None
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         # rise exception as this feature is not implemented yet
-        raise NotImplementedError("This feature is not implemented in NEON yet.")
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 3eed597f..1740f689 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -295,15 +295,16 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
         wp.synchronize()
 
         import neon, typing
+
         @neon.Container.factory("")
         def container(
-                bc_mask_warp: typing.Any,
-                missing_mask_warp: typing.Any,
-                bc_mask_field: typing.Any,
-                missing_mask_field: typing.Any,
+            bc_mask_warp: typing.Any,
+            missing_mask_warp: typing.Any,
+            bc_mask_field: typing.Any,
+            missing_mask_field: typing.Any,
         ):
             def loading_step(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask.get_grid(), 0)
+                loader.set_mres_grid(bc_mask.get_grid(), level=0)
 
                 bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
                 missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
@@ -315,23 +316,12 @@ def masker(gridIdx: typing.Any):
                     gy = wp.neon_get_y(cIdx)
                     gz = wp.neon_get_z(cIdx)
                     # TODO@Max - XLB is flattening the y dimension in 3D, while neon uses the z dimension
-                    local_mask = bc_mask_warp[
-                        0,
-                        gx,
-                        gz,
-                        gy]
+                    local_mask = bc_mask_warp[0, gx, gz, gy]
                     wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
 
                     for q in range(self.velocity_set.q):
-                        is_missing = wp.uint8( missing_mask_warp[
-                            q,
-                            wp.neon_get_x(cIdx),
-                            wp.neon_get_z(cIdx),
-                            wp.neon_get_y(cIdx)])
-                        wp.neon_write(missing_mask_hdl,
-                                      gridIdx,
-                                      q,
-                                      is_missing)
+                        is_missing = wp.uint8(missing_mask_warp[q, wp.neon_get_x(cIdx), wp.neon_get_z(cIdx), wp.neon_get_y(cIdx)])
+                        wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
 
                 loader.declare_kernel(masker)
 
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index f7ac3e79..e1346b1b 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -5,7 +5,7 @@
 import os
 
 # Print the PYTHONPATH
-pythonpath = os.environ.get('PYTHONPATH', 'PYTHONPATH is not set')
+pythonpath = os.environ.get("PYTHONPATH", "PYTHONPATH is not set")
 print(f"PYTHONPATH: {pythonpath}")
 import neon
 from typing import Any
@@ -23,6 +23,7 @@ class MultiresQuadraticEquilibrium(Equilibrium):
 
     def _construct_neon(self):
         import neon
+
         # Set local constants TODO: This is a hack and should be fixed with warp update
         _c = self.velocity_set.c
         _w = self.velocity_set.w
@@ -58,20 +59,20 @@ def functional(
             return feq
 
         import typing
+
         @neon.Container.factory(name="QuadraticEquilibrium")
         def container(
-                level,
+            level,
             rho: Any,
             u: Any,
             f: Any,
         ):
-
-            def quadratic_equilibrium_ll(loader:neon.Loader):
+            def quadratic_equilibrium_ll(loader: neon.Loader):
                 loader.set_mres_grid(rho.get_grid(), level)
 
-                rho_pn=loader.get_mres_read_handle(rho)
-                u_pn =loader.get_mres_read_handle(u)
-                f_pn=loader.get_mres_write_handle(f)
+                rho_pn = loader.get_mres_read_handle(rho)
+                u_pn = loader.get_mres_read_handle(u)
+                f_pn = loader.get_mres_write_handle(f)
 
                 @wp.func
                 def quadratic_equilibrium_cl(index: typing.Any):
@@ -87,13 +88,16 @@ def quadratic_equilibrium_cl(index: typing.Any):
                     # Set the output
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_pn, index, l, feq[l])
+
                 loader.declare_kernel(quadratic_equilibrium_cl)
+
             return quadratic_equilibrium_ll
+
         return functional, container
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, level, rho, u, f, stream):
-        c = self.neon_container( level, rho, u, f)
+        c = self.neon_container(level, rho, u, f)
         c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
 
         return f
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index 31dbfed0..89e2aaef 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -5,7 +5,7 @@
 import os
 
 # Print the PYTHONPATH
-pythonpath = os.environ.get('PYTHONPATH', 'PYTHONPATH is not set')
+pythonpath = os.environ.get("PYTHONPATH", "PYTHONPATH is not set")
 print(f"PYTHONPATH: {pythonpath}")
 import neon
 from typing import Any
@@ -103,9 +103,9 @@ def warp_implementation(self, rho, u, f):
         )
         return f
 
-
     def _construct_neon(self):
         import neon
+
         # Set local constants TODO: This is a hack and should be fixed with warp update
         _c = self.velocity_set.c
         _w = self.velocity_set.w
@@ -141,18 +141,18 @@ def functional(
             return feq
 
         import neon, typing
+
         @neon.Container.factory(name="QuadraticEquilibrium")
         def container(
             rho: Any,
             u: Any,
             f: Any,
         ):
-
-            def quadratic_equilibrium_ll(loader:neon.Loader):
+            def quadratic_equilibrium_ll(loader: neon.Loader):
                 loader.set_grid(rho.get_grid())
-                rho_pn=loader.get_read_handle(rho)
-                u_pn =loader.get_read_handle(u)
-                f_pn=loader.get_write_handle(f)
+                rho_pn = loader.get_read_handle(rho)
+                u_pn = loader.get_read_handle(u)
+                f_pn = loader.get_write_handle(f)
 
                 @wp.func
                 def quadratic_equilibrium_cl(index: typing.Any):
@@ -164,15 +164,17 @@ def quadratic_equilibrium_cl(index: typing.Any):
 
                     # Set the output
                     for l in range(self.velocity_set.q):
-                        #wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                        # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
                         wp.neon_write(f_pn, index, l, feq[l])
+
                 loader.declare_kernel(quadratic_equilibrium_cl)
+
             return quadratic_equilibrium_ll
+
         return functional, container
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, rho, u, f):
-        c = self.neon_container( rho, u, f)
+        c = self.neon_container(rho, u, f)
         c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
-
         return f
diff --git a/xlb/operator/macroscopic/first_moment.py b/xlb/operator/macroscopic/first_moment.py
index 5555f812..67798e13 100644
--- a/xlb/operator/macroscopic/first_moment.py
+++ b/xlb/operator/macroscopic/first_moment.py
@@ -67,10 +67,10 @@ def warp_implementation(self, f, rho, u):
         return u
 
     def _construct_neon(self):
-        functional,_  = self._construct_warp()
+        functional, _ = self._construct_warp()
         return functional, None
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho):
         # rise exception as this feature is not implemented yet
-        raise NotImplementedError("This feature is not implemented in NEON yet.")
\ No newline at end of file
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index b1927b24..eed2ac10 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -76,38 +76,40 @@ def functional(f: _f_vec):
             u = first_moment_func(f, rho)
             return rho, u
 
-
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         import neon, typing
+
         @neon.Container.factory("macroscopic")
         def container(
-                f_field: Any,
-                rho_field: Any,
-                u_fild: Any,
+            f_field: Any,
+            rho_field: Any,
+            u_fild: Any,
         ):
             _d = self.velocity_set.d
+
             def macroscopic_ll(loader: neon.Loader):
                 loader.set_grid(f_field.get_grid())
 
-                rho=loader.get_read_handle(rho_field)
-                u =loader.get_read_handle(u_fild)
-                f=loader.get_read_handle(f_field)
+                rho = loader.get_read_handle(rho_field)
+                u = loader.get_read_handle(u_fild)
+                f = loader.get_read_handle(f_field)
 
                 @wp.func
                 def macroscopic_cl(gIdx: typing.Any):
                     _f = _f_vec()
                     for l in range(self.velocity_set.q):
-                        _f[l] = wp.neon_read(f, gIdx,l)
+                        _f[l] = wp.neon_read(f, gIdx, l)
                     _rho, _u = functional(_f)
                     wp.neon_write(rho, gIdx, 0, _rho)
                     for d in range(_d):
                         wp.neon_write(u, gIdx, d, _u[d])
 
                 loader.declare_kernel(macroscopic_cl)
+
             return macroscopic_ll
-        return functional, container
 
+        return functional, container
 
     def _construct_neon_visual(self):
         zero_moment_func = self.zero_moment.neon_functional
@@ -120,32 +122,34 @@ def functional(f: _f_vec):
             u = first_moment_func(f, rho)
             return rho, u
 
-
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         import neon, typing
+
         @neon.Container.factory("macroscopic")
         def container(
-                f_field: Any,
-                bc_mask: Any,
-                rho_field: Any,
-                u_fild: Any,
+            f_field: Any,
+            bc_mask: Any,
+            rho_field: Any,
+            u_fild: Any,
         ):
             _d = self.velocity_set.d
+
             def macroscopic_ll(loader: neon.Loader):
                 loader.set_grid(f_field.get_grid())
 
-                rho=loader.get_read_handle(rho_field)
-                u =loader.get_read_handle(u_fild)
-                f=loader.get_read_handle(f_field)
+                rho = loader.get_read_handle(rho_field)
+                u = loader.get_read_handle(u_fild)
+                f = loader.get_read_handle(f_field)
                 bc_mask_pn = loader.get_read_handle(bc_mask)
+
                 @wp.func
                 def macroscopic_cl(gIdx: typing.Any):
                     _f = _f_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, gIdx, 0)
 
                     for l in range(self.velocity_set.q):
-                        _f[l] = wp.neon_read(f, gIdx,l)
+                        _f[l] = wp.neon_read(f, gIdx, l)
                     _rho, _u = functional(_f)
                     if _boundary_id != wp.uint8(0):
                         _rho = self.compute_dtype(1.0)
@@ -161,19 +165,10 @@ def macroscopic_cl(gIdx: typing.Any):
                         wp.neon_write(u, gIdx, d, _u[d])
 
                 loader.declare_kernel(macroscopic_cl)
-            return macroscopic_ll
-        return functional, container
 
+            return macroscopic_ll
 
-    def get_containers(self, f_0, f_1, rho, u):
-        _, container = self._construct_neon()
-        return {'even': container(f_0,   rho, u),
-                'odd': container(f_1,  rho, u)}
-
-    def get_containers_visual(self, f_0, f_1, bc_mask, rho, u):
-        _, container = self._construct_neon()
-        return {'even': container(f_0,  bc_mask, rho, u),
-                'odd': container(f_1, bc_mask,  rho, u)}
+        return functional, container
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho, u):
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index e57b8e21..0a5bc92e 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -18,7 +18,6 @@ def __init__(self, *args, **kwargs):
         self.first_moment = FirstMoment(*args, **kwargs)
         super().__init__(*args, **kwargs)
 
-
     def _construct_warp(self):
         zero_moment_func = self.zero_moment.warp_functional
         first_moment_func = self.first_moment.warp_functional
@@ -71,10 +70,10 @@ def functional(f: _f_vec):
             u = first_moment_func(f, rho)
             return rho, u
 
-
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         import neon, typing
+
         @neon.Container.factory("macroscopic")
         def container(
             level: int,
@@ -84,12 +83,13 @@ def container(
             u_fild: Any,
         ):
             _d = self.velocity_set.d
+
             def macroscopic_ll(loader: neon.Loader):
                 loader.set_mres_grid(f_field.get_grid(), level)
 
-                rho=loader.get_mres_write_handle(rho_field)
-                u =loader.get_mres_write_handle(u_fild)
-                f=loader.get_mres_read_handle(f_field)
+                rho = loader.get_mres_write_handle(rho_field)
+                u = loader.get_mres_write_handle(u_fild)
+                f = loader.get_mres_read_handle(f_field)
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask)
 
                 @wp.func
@@ -98,7 +98,7 @@ def macroscopic_cl(gIdx: typing.Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, gIdx, 0)
 
                     for l in range(self.velocity_set.q):
-                        _f[l] = wp.neon_read(f, gIdx,l)
+                        _f[l] = wp.neon_read(f, gIdx, l)
 
                     _rho, _u = functional(_f)
 
@@ -117,35 +117,19 @@ def macroscopic_cl(gIdx: typing.Any):
                         wp.neon_write(u, gIdx, d, _u[d])
 
                 loader.declare_kernel(macroscopic_cl)
+
             return macroscopic_ll
-        return functional, container
 
-    def get_containers(self, target_level, f_0, f_1, bc_mask, rho, u):
-        _, container = self._construct_neon()
-        evenList = []
-        oddList = []
-        evenList.append(container(target_level, f_0, bc_mask,   rho, u))
-        oddList.append( container(target_level, f_1, bc_mask,  rho, u))
-        return {'even':evenList ,
-                'odd':oddList }
-
-    def get_container(self, target_level, f_0, f_1, bc_mask, rho, u):
-        _, self.container = self._construct_neon()
-        evenList = []
-        oddList = []
-        evenList.append(container(target_level, f_0, bc_mask, rho, u))
-        oddList.append(container(target_level, f_1, bc_mask, rho, u))
-        return {"macro": evenList, "odd": oddList}
+        return functional, container
 
     def init_containers(self):
-        self.containers=None
+        self.containers = None
         _, self.containers = self._construct_neon()
 
-    def launch_container(self, streamId, f_0,  bc_mask, rho, u):
+    def launch_container(self, streamId, f_0, bc_mask, rho, u):
         grid = f_0.get_grid()
         for target_level in range(grid.num_levels):
-                self.containers(target_level, f_0, bc_mask, rho, u).run(streamId)
-
+            self.containers(target_level, f_0, bc_mask, rho, u).run(streamId)
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho, u):
diff --git a/xlb/operator/macroscopic/zero_moment.py b/xlb/operator/macroscopic/zero_moment.py
index 13d69b62..da9d034e 100644
--- a/xlb/operator/macroscopic/zero_moment.py
+++ b/xlb/operator/macroscopic/zero_moment.py
@@ -49,10 +49,10 @@ def warp_implementation(self, f, rho):
         return rho
 
     def _construct_neon(self):
-        functional,_  = self._construct_warp()
+        functional, _ = self._construct_warp()
         return functional, None
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho):
         # rise exception as this feature is not implemented yet
-        raise NotImplementedError("This feature is not implemented in NEON yet.")
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 9a9f98e5..cc4d7c79 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -73,9 +73,7 @@ def __call__(self, *args, callback=None, **kwargs):
                 error = e
                 traceback_str = traceback.format_exc()
                 continue  # This skips to the next candidate if binding fails
-        method_candidates = [
-            (key, method) for key, method in self._backends.items() if key[1] == self.compute_backend
-        ]
+        method_candidates = [(key, method) for key, method in self._backends.items() if key[1] == self.compute_backend]
         raise Exception(f"Error captured for backend with key {key} for operator {self.__class__.__name__}: {error}\n {traceback_str}")
 
     @property
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 6a6bd01c..20597fbf 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -26,17 +26,14 @@
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
     def __init__(
-            self,
-            grid,
-            boundary_conditions=[],
-            collision_type="BGK",
-            forcing_scheme="exact_difference",
-            force_vector=None,
+        self,
+        grid,
+        boundary_conditions=[],
+        collision_type="BGK",
+        forcing_scheme="exact_difference",
+        force_vector=None,
     ):
         super().__init__(grid, boundary_conditions)
-        self.odd_or_even = 'even'
-        self.c_even = None
-        self.c_odd = None
 
         # Construct the collision operator
         if collision_type == "BGK":
@@ -45,8 +42,7 @@ def __init__(
             self.collision = KBC(self.velocity_set, self.precision_policy, self.compute_backend)
 
         if force_vector is not None:
-            self.collision = ForcedCollision(collision_operator=self.collision, forcing_scheme=forcing_scheme,
-                                             force_vector=force_vector)
+            self.collision = ForcedCollision(collision_operator=self.collision, forcing_scheme=forcing_scheme, force_vector=force_vector)
 
         # Construct the operators
         self.stream = Stream(self.velocity_set, self.precision_policy, self.compute_backend)
@@ -76,25 +72,24 @@ def prepare_fields(self, rho, u, initializer=None):
         bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
         from xlb.helper.initializers import initialize_multires_eq
-        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend,
-                                     rho=rho, u=u)
+
+        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
 
         for level in range(self.grid.count_levels):
             f_1.copy_from_run(level, f_0, 0)
-        f_0.update_host(0)
-        wp.synchronize()
+        # f_0.update_host(0)
+        # wp.synchronize()
         # f_0.export_vti("f0_eq_init.vti", "init_f0")
 
         # Process boundary conditions and update masks
-        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask,
-                                                                  xlb_grid=self.grid)
+        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
         f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
         # bc_mask.update_host(0)
         bc_mask.update_host(0)
         f_0.update_host(0)
         wp.synchronize()
-        bc_mask.export_vti("bc_mask.vti", 'bc_mask')
+        bc_mask.export_vti("bc_mask.vti", "bc_mask")
         # f_0.export_vti("init_f0.vti", 'init_f0')
         # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
@@ -114,6 +109,7 @@ def ll_coalescence_count(loader: neon.Loader):
                 _c = self.velocity_set.c
                 _w = self.velocity_set.w
                 import typing
+
                 @wp.func
                 def cl_collide_coarse(index: typing.Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
@@ -122,9 +118,7 @@ def cl_collide_coarse(index: typing.Any):
                     if not wp.neon_has_child(coalescence_factor_pn, index):
                         for l in range(self.velocity_set.q):
                             if level < num_levels - 1:
-                                push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
-                                                                 wp.int8(_c[1, l]),
-                                                                 wp.int8(_c[2, l]))
+                                push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
                                 val = self.compute_dtype(1)
                                 wp.neon_mres_lbm_store_op(coalescence_factor_pn, index, l, push_direction, val)
 
@@ -147,6 +141,7 @@ def loading(loader: neon.Loader):
                 _c = self.velocity_set.c
                 _w = self.velocity_set.w
                 import typing
+
                 @wp.func
                 def compute(index: typing.Any):
                     # _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
@@ -172,13 +167,12 @@ def compute(index: typing.Any):
                             # HERE, we skip the center direction
                             continue
 
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
-                                                         wp.int8(-_c[1, l]),
-                                                         wp.int8(-_c[2, l]))
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        coalescence_factor = wp.neon_read_ngh(coalescence_factor_pn, index, pull_direction, l,
-                                                              self.compute_dtype(0), has_ngh_at_same_level)
+                        coalescence_factor = wp.neon_read_ngh(
+                            coalescence_factor_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level
+                        )
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(coalescence_factor_pn, index, pull_direction):
@@ -199,8 +193,7 @@ def compute(index: typing.Any):
                                 # YES ngh. at the same level
                                 # -> **Coalescence**
                                 if coalescence_factor > self.compute_dtype(0):
-                                    coalescence_factor = self.compute_dtype(1) / (
-                                            self.compute_dtype(2) * coalescence_factor)
+                                    coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor)
                                     wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
 
                             else:
@@ -275,15 +268,15 @@ def _construct_neon(self):
 
         @wp.func
         def apply_bc(
-                index: Any,
-                timestep: Any,
-                _boundary_id: Any,
-                missing_mask: Any,
-                f_0: Any,
-                f_1: Any,
-                f_pre: Any,
-                f_post: Any,
-                is_post_streaming: bool,
+            index: Any,
+            timestep: Any,
+            _boundary_id: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+            is_post_streaming: bool,
         ):
             f_result = f_post
 
@@ -292,15 +285,11 @@ def apply_bc(
                 if is_post_streaming:
                     if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep,
-                                                                                              missing_mask, f_0, f_1,
-                                                                                              f_pre, f_post)
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
                 else:
                     if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep,
-                                                                                              missing_mask, f_0, f_1,
-                                                                                              f_pre, f_post)
+                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
                     if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
                             f_result = wp.static(self.boundary_conditions[i].prepare_bc_auxilary_data)(
@@ -310,10 +299,10 @@ def apply_bc(
 
         @wp.func
         def neon_get_thread_data(
-                f0_pn: Any,
-                f1_pn: Any,
-                missing_mask_pn: Any,
-                index: Any,
+            f0_pn: Any,
+            f1_pn: Any,
+            missing_mask_pn: Any,
+            index: Any,
         ):
             # Read thread data for populations
             _f0_thread = _f_vec()
@@ -331,19 +320,16 @@ def neon_get_thread_data(
 
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             num_levels = f_0_fd.get_grid().get_num_levels()
 
-            # module op to define odd of even iteration
-            even_itertation = wp.mod(timestep, 2) == 0
-
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -374,10 +360,8 @@ def device(index: typing.Any):
                         return
 
                     if not wp.neon_has_child(f_0_pn, index):
-
                         # Read thread data for populations, these are post streaming
-                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn,
-                                                                                     index)
+                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
                         _f_post_stream = _f0_thread
 
                         _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
@@ -388,10 +372,8 @@ def device(index: typing.Any):
                         # _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
 
                         for l in range(self.velocity_set.q):
-                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
-                                                             wp.int8(_c[1, l]),
-                                                             wp.int8(_c[2, l]))
-                            if (level < num_levels - 1):
+                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                            if level < num_levels - 1:
                                 val = _f_post_collision[l]
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
                                 wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
@@ -408,13 +390,13 @@ def device(index: typing.Any):
 
         @neon.Container.factory(name="stream_coarse_step_A")
         def stream_coarse_step_A(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             num_levels = f_0_fd.get_grid().get_num_levels()
 
@@ -422,9 +404,6 @@ def stream_coarse_step_A(
             #     # throw an exception
             #     raise Exception("Only the finest level is supported for now")
 
-            # module op to define odd of even iteration
-            # od_or_even = wp.module("odd_or_even", "even")
-
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -449,10 +428,7 @@ def cl_stream_coarse(index: typing.Any):
 
                     # do stream normally
                     _missing_mask = _missing_mask_vec()
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn,
-                                                                                 f_1_pn,
-                                                                                 missing_mask_pn,
-                                                                                 index)
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
@@ -466,13 +442,13 @@ def cl_stream_coarse(index: typing.Any):
 
         @neon.Container.factory(name="stream_coarse_step_B")
         def stream_coarse_step_B(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -503,13 +479,10 @@ def cl_stream_coarse(index: typing.Any):
                             # HERE, we skip the center direction
                             continue
 
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
-                                                         wp.int8(-_c[1, l]),
-                                                         wp.int8(-_c[2, l]))
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
-                                                       has_ngh_at_same_level)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
@@ -520,9 +493,9 @@ def cl_stream_coarse(index: typing.Any):
                                 if wp.neon_has_parent(f_0_pn, index):
                                     # YES halo cell on top of us
                                     has_a_courser_ngh = wp.bool(False)
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l,
-                                                                                self.compute_dtype(0),
-                                                                                has_a_courser_ngh)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                    )
                                     if has_a_courser_ngh:
                                         # Full state:
                                         # NO finer ngh. in the pull direction (opposite of l)
@@ -559,15 +532,14 @@ def cl_stream_coarse(index: typing.Any):
 
         @neon.Container.factory(name="stream_coarse_step_C")
         def stream_coarse_step_C(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
-
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -598,13 +570,7 @@ def cl_stream_coarse(index: typing.Any):
                     _f_post_stream = _f1_thread
 
                     # do non mres post-streaming corrections
-                    _f_post_stream = apply_bc(
-                        index, timestep,
-                        _boundary_id,
-                        _missing_mask,
-                        f_0_pn, f_1_pn,
-                        _f_post_collision, _f_post_stream, True
-                    )
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
@@ -618,15 +584,8 @@ def cl_stream_coarse(index: typing.Any):
             "collide_coarse": collide_coarse,
             "stream_coarse_step_A": stream_coarse_step_A,
             "stream_coarse_step_B": stream_coarse_step_B,
-            "stream_coarse_step_C": stream_coarse_step_C}
-
-    def get_containers(self, target_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
-        containers = {'even': {}, 'odd': {}}
-        _, container = self._construct_neon()
-        for key in container.keys():
-            containers['odd'][key] = container[key](target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)
-            containers['even'][key] = container[key](target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
-        return containers
+            "stream_coarse_step_C": stream_coarse_step_C,
+        }
 
     def init_containers(self):
         self.containers = None
diff --git a/xlb/operator/stepper/nse_multires_stepper_vk.py b/xlb/operator/stepper/nse_multires_stepper_vk.py
index 3e45f95b..3f32c4bb 100644
--- a/xlb/operator/stepper/nse_multires_stepper_vk.py
+++ b/xlb/operator/stepper/nse_multires_stepper_vk.py
@@ -34,7 +34,7 @@ def __init__(
         force_vector=None,
     ):
         super().__init__(grid, boundary_conditions)
-        self.odd_or_even='even'
+        self.odd_or_even = "even"
         self.c_even = None
         self.c_odd = None
 
@@ -75,6 +75,7 @@ def prepare_fields(self, rho, u, initializer=None):
         bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
         from xlb.helper.initializers import initialize_multires_eq
+
         f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
 
         for level in range(self.grid.count_levels):
@@ -91,10 +92,10 @@ def prepare_fields(self, rho, u, initializer=None):
         bc_mask.update_host(0)
         f_0.update_host(0)
         wp.synchronize()
-        bc_mask.export_vti("bc_mask.vti", 'bc_mask')
-        f_0.export_vti("init_f0.vti", 'init_f0')
+        bc_mask.export_vti("bc_mask.vti", "bc_mask")
+        f_0.export_vti("init_f0.vti", "init_f0")
 
-        #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
+        # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         return f_0, f_1, bc_mask, missing_mask
 
@@ -142,7 +143,7 @@ def _construct_neon(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _opp_indices = self.velocity_set.opp_indices
-        #_cast_to_store_dtype = self.store_dtype()
+        # _cast_to_store_dtype = self.store_dtype()
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
@@ -207,15 +208,16 @@ def neon_get_thread_data(
             return _f0_thread, _f1_thread, _missing_mask
 
         import typing
+
         @neon.Container.factory(name="finest_collide")
         def single_step_finest(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             # if level != 0:
             #     # throw an exception
@@ -227,11 +229,11 @@ def single_step_finest(
             def ll_single_step_finest(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-                f_0_pn=loader.get_mres_read_handle(f_0_fd)
-                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
-                f_1_pn =loader.get_mres_write_handle(f_1_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 @wp.func
                 def cl_single_step_finest(index: typing.Any):
@@ -249,21 +251,21 @@ def cl_single_step_finest(index: typing.Any):
                     _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
 
                     # Apply post-collision boundary conditions
-                    _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+                    _f_post_collision = apply_bc(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                    )
 
                     # Apply streaming boundary conditions
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, True)
                     _opposite_c_idx = self.velocity_set.self.opp_indices
 
                     for l in range(self.velocity_set.q):
-                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]),
-                                                   wp.int8(_c[1, l]),
-                                                   wp.int8(_c[2, l]))
+                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
                         ## Store
                         if od_or_even == 0:
                             wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_stream[l])
                         else:
-                            wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction,_f_post_stream[l])
+                            wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, _f_post_stream[l])
 
                         ## Push stream
                         is_active = wp.neon_is_active(f_0_pn, index, push_direction)
@@ -289,30 +291,31 @@ def cl_single_step_finest(index: typing.Any):
                                     wp.neon_write(f_1_pn, index, l, _f_post_stream[l], value)
 
                 loader.declare_kernel(cl_single_step_finest)
+
             return ll_single_step_finest
 
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             num_levels = f_0_fd.get_grid().get_num_levels()
 
             # module op to define odd of even iteration
-            even_itertation = wp.mod(timestep, 2)==0
+            even_itertation = wp.mod(timestep, 2) == 0
 
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-                f_0_pn=loader.get_mres_read_handle(f_0_fd)
-                bc_mask_pn=loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn=loader.get_mres_read_handle(missing_mask_fd)
-                f_1_pn =loader.get_mres_write_handle(f_1_fd)
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 _c = self.velocity_set.c
 
@@ -328,7 +331,6 @@ def cl_collide_coarse(index: typing.Any):
                         return
 
                     if not wp.neon_has_child(f_0_pn, index):
-
                         # Read thread data for populations, these are post streaming
                         _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
                         _f_post_stream = _f0_thread
@@ -338,11 +340,13 @@ def cl_collide_coarse(index: typing.Any):
                         _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
 
                         # Apply post-collision boundary conditions
-                        _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+                        _f_post_collision = apply_bc(
+                            index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                        )
 
                         for l in range(self.velocity_set.q):
                             push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                            if(level < num_levels - 1):
+                            if level < num_levels - 1:
                                 ## Store
                                 # if even_itertation == 0:
                                 #     wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, _f_post_collision[l])
@@ -356,6 +360,7 @@ def cl_collide_coarse(index: typing.Any):
                     wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_collide_coarse)
+
             return ll_collide_coarse
 
         @neon.Container.factory(name="stream_coarse")
@@ -374,7 +379,7 @@ def stream_coarse(
             #     raise Exception("Only the finest level is supported for now")
 
             # module op to define odd of even iteration
-            #od_or_even = wp.module("odd_or_even", "even")
+            # od_or_even = wp.module("odd_or_even", "even")
 
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -405,7 +410,7 @@ def cl_stream_coarse(index: typing.Any):
 
                                 #  if (!pin.hasChildren(cell, dir)) {
                                 if not wp.neon_has_child(f_0_pn, index, pull_direction):
-                                    #if (nghType.mIsValid) {
+                                    # if (nghType.mIsValid) {
                                     # NOTHING as taken  care after
                                     # } else if (pin.hasParent(cell) && !(dir.x == 0 && dir.y == 0 && dir.z == 0)) {
                                     if wp.neon_has_parent(f_0_pn, index):
@@ -415,17 +420,19 @@ def cl_stream_coarse(index: typing.Any):
                                             # if is_valid:
                                             #     #_f_post_stream[l] = uncle_val
                                             #     # HERE DB
-                                            _f_post_stream[l] =  self.compute_dtype(0.0)
+                                            _f_post_stream[l] = self.compute_dtype(0.0)
                                 else:
                                     is_valid = wp.bool(False)
-                                    read_accumulate_date = wp.neon_read_ngh(f_1_pn, index, pull_direction, l, self.compute_dtype(0),is_valid)
+                                    read_accumulate_date = wp.neon_read_ngh(f_1_pn, index, pull_direction, l, self.compute_dtype(0), is_valid)
                                     if is_valid:
-                                        #_f_post_stream[l] = read_accumulate_date * self.compute_dtype(0.5)
+                                        # _f_post_stream[l] = read_accumulate_date * self.compute_dtype(0.5)
                                         # HERE DB
                                         _f_post_stream[l] = self.compute_dtype(0.0)
 
                             # do non mres post-streaming corrections
-                            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_stream, True)
+                            _f_post_stream = apply_bc(
+                                index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_stream, True
+                            )
 
                             for l in range(self.velocity_set.q):
                                 wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
@@ -436,31 +443,32 @@ def cl_stream_coarse(index: typing.Any):
             return ll_stream_coarse
 
         return None, {
-            #"single_step_finest": single_step_finest,
+            # "single_step_finest": single_step_finest,
             "collide_coarse": collide_coarse,
-            "stream_coarse": stream_coarse}
+            "stream_coarse": stream_coarse,
+        }
 
-    def get_containers(self, target_level,  f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        containers = {'even': {}, 'odd': {}}
+    def get_containers(self, target_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
+        containers = {"even": {}, "odd": {}}
         _, container = self._construct_neon()
         for key in container.keys():
-            containers['odd'][key] = container[key](target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)
-            containers['even'][key] = container[key](target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
+            containers["odd"][key] = container[key](target_level, f_1, f_0, bc_mask, missing_mask, omega, 1)
+            containers["even"][key] = container[key](target_level, f_0, f_1, bc_mask, missing_mask, omega, 0)
         return containers
 
     def init_containers(self):
-        self.containers=None
+        self.containers = None
         _, self.containers = self._construct_neon()
 
-    def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+    def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
-    def add_to_app(self, app:typing.List, op_name, mres_level, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
+    def add_to_app(self, app: typing.List, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         app.append(self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        #if self.c is None:
+    def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
+        # if self.c is None:
         #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
         # c = None
         # if self.odd_or_even == 'even':
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 1385900d..d3a389e8 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -35,9 +35,6 @@ def __init__(
         force_vector=None,
     ):
         super().__init__(grid, boundary_conditions)
-        self.odd_or_even='even'
-        self.c_even = None
-        self.c_odd = None
 
         # Construct the collision operator
         if collision_type == "BGK":
@@ -92,6 +89,7 @@ def prepare_fields(self, initializer=None):
         if True:
             import xlb.velocity_set
             from xlb.operator.macroscopic import Macroscopic
+
             # macro = Macroscopic(
             #     compute_backend=ComputeBackend.NEON,
             #     precision_policy=self.precision_policy,
@@ -119,11 +117,11 @@ def prepare_fields(self, initializer=None):
         bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
         f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
-        bc_mask.update_host(0)
-        missing_mask.update_host(0)
+        # bc_mask.update_host(0)
+        # missing_mask.update_host(0)
         wp.synchronize()
-        #bc_mask.export_vti("bc_mask.vti", 'bc_mask')
-        #missing_mask.export_vti("missing_mask.vti", 'missing_mask')
+        # bc_mask.export_vti("bc_mask.vti", 'bc_mask')
+        # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         return f_0, f_1, bc_mask, missing_mask
 
@@ -366,7 +364,7 @@ def _construct_neon(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _opp_indices = self.velocity_set.opp_indices
-        #_cast_to_store_dtype = self.store_dtype()
+        # _cast_to_store_dtype = self.store_dtype()
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
@@ -431,24 +429,26 @@ def neon_get_thread_data(
             return _f0_thread, _f1_thread, _missing_mask
 
         import typing
+
         @neon.Container.factory(name="nse_stepper")
         def container(
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             cast_to_store_dtype = self.store_dtype
+
             def nse_stepper_ll(loader: neon.Loader):
                 loader.set_grid(bc_mask_fd.get_grid())
 
-                f_0_pn=(loader.get_read_handle(f_0_fd))
-                bc_mask_pn=loader.get_read_handle(bc_mask_fd)
-                missing_mask_pn=loader.get_read_handle(missing_mask_fd)
+                f_0_pn = loader.get_read_handle(f_0_fd)
+                bc_mask_pn = loader.get_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_read_handle(missing_mask_fd)
 
-                f_1_pn =loader.get_write_handle(f_1_fd)
+                f_1_pn = loader.get_write_handle(f_1_fd)
 
                 @wp.func
                 def nse_stepper_cl(index: typing.Any):
@@ -469,7 +469,9 @@ def nse_stepper_cl(index: typing.Any):
                     _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
 
                     # Apply post-collision boundary conditions
-                    _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+                    _f_post_collision = apply_bc(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                    )
 
                     # Store the result in f_1
                     for l in range(self.velocity_set.q):
@@ -479,37 +481,15 @@ def nse_stepper_cl(index: typing.Any):
                                 if _missing_mask[l] == wp.uint8(1):
                                     wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
                         wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
                 loader.declare_kernel(nse_stepper_cl)
+
             return nse_stepper_ll
 
         return None, container
 
-    def get_containers(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        _, container = self._construct_neon()
-        return {'even': container(f_0, f_1,  bc_mask, missing_mask, omega, 0),
-                'odd': container(f_1, f_0, bc_mask, missing_mask, omega, 1)}
-
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_launch(self, f_0, f_1, bc_mask, missing_mask,  omega, timestep):
-        #if self.c is None:
-        #    self.c = self.neon_container(f_0, f_1, bc_mask, missing_mask, timestep)
-        # c = None
-        # if self.odd_or_even == 'even':
-        #     c = self.c_even
-        # else:
-        #     c = self.c_odd
-        #
-        # if c is None:
-        #     pass
+    def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
         c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
-        #
-        # if self.odd_or_even == 'even':
-        #     c = self.c_even
-        # else:
-        #     c = self.c_odd
-        #
-        # if self.odd_or_even == 'even':
-        #     self.odd_or_even = 'odd'
-
         return f_0, f_1
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index 2c6a0bc7..8626d101 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -121,7 +121,7 @@ def _construct_neon(self):
         # Construct the funcional to get streamed indices
         @wp.func
         def functional(
-            f:Any,
+            f: Any,
             index: Any,
         ):
             # Pull the distribution function
@@ -132,9 +132,7 @@ def functional(
                 # for d in range(self.velocity_set.d):
                 #     pull_index[d] = index[d] - _c[d, l]
 
-                ngh = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
-                                      wp.int8(-_c[1, l]),
-                                      wp.int8(-_c[2, l]))
+                ngh = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                 unused_is_valid = wp.bool(False)
                 _f[l] = wp.neon_read_ngh(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)
@@ -163,4 +161,4 @@ def functional(
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_0, f_1):
         # rise exception as this feature is not implemented yet
-        raise NotImplementedError("This feature is not implemented in NEON yet.")
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")

From 8cfc636bf326785c8d56ab6174394efaa73593b6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 20 May 2025 14:13:09 -0400
Subject: [PATCH 039/208] added dGrid and mGrid handling in indices masker

---
 examples/performance/mlups_3d_neon.py         |   8 +-
 examples/performance/mlups_3d_neon_sovler.py  | 165 ------------------
 xlb/helper/nse_solver.py                      |   2 +-
 .../indices_boundary_masker.py                |  88 +++-------
 xlb/operator/stepper/nse_multires_stepper.py  |  60 ++-----
 5 files changed, 44 insertions(+), 279 deletions(-)
 delete mode 100644 examples/performance/mlups_3d_neon_sovler.py

diff --git a/examples/performance/mlups_3d_neon.py b/examples/performance/mlups_3d_neon.py
index 4496287e..932c1567 100644
--- a/examples/performance/mlups_3d_neon.py
+++ b/examples/performance/mlups_3d_neon.py
@@ -6,12 +6,6 @@
 import warp as wp
 import numpy as np
 
-# add a directory to the PYTHON PATH
-import sys
-
-sys.path.append("/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/")
-import neon
-
 from xlb.compute_backend import ComputeBackend
 from xlb.precision_policy import PrecisionPolicy
 from xlb.grid import grid_factory
@@ -97,7 +91,7 @@ def run(macro, compute_backend, precision_policy, grid_shape, num_steps):
     start_time = time.time()
 
     for i in range(num_steps):
-        f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
+        f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, 0)
         f_0, f_1 = f_1, f_0
 
         # if i % 2 == 0 or i == num_steps - 1:
diff --git a/examples/performance/mlups_3d_neon_sovler.py b/examples/performance/mlups_3d_neon_sovler.py
deleted file mode 100644
index ff72e366..00000000
--- a/examples/performance/mlups_3d_neon_sovler.py
+++ /dev/null
@@ -1,165 +0,0 @@
-from warp.examples.fem.example_convection_diffusion import velocity
-
-import xlb
-import argparse
-import time
-import warp as wp
-import numpy as np
-
-# add a directory to the PYTHON PATH
-import sys
-
-sys.path.append("/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/")
-import neon
-
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import grid_factory
-from xlb.operator.stepper import IncompressibleNavierStokesStepper
-from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
-from xlb.distribute import distribute
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
-    # Positional arguments
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-
-    # Optional arguments
-    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-
-    return parser.parse_args()
-
-
-def setup_simulation(args):
-    backend = None
-    if args.backend == "jax":
-        backend = ComputeBackend.JAX
-    elif args.backend == "warp":
-        backend = ComputeBackend.WARP
-    elif args.backend == "neon":
-        backend = ComputeBackend.NEON
-    if backend is None:
-        raise ValueError("Invalid backend")
-
-    precision_policy_map = {
-        "fp32/fp32": PrecisionPolicy.FP32FP32,
-        "fp64/fp64": PrecisionPolicy.FP64FP64,
-        "fp64/fp32": PrecisionPolicy.FP64FP32,
-        "fp32/fp16": PrecisionPolicy.FP32FP16,
-    }
-    precision_policy = precision_policy_map.get(args.precision)
-    if precision_policy is None:
-        raise ValueError("Invalid precision")
-
-    velocity_set = None
-    if args.velocity_set == "D3Q19":
-        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == "D3Q27":
-        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
-    if velocity_set is None:
-        raise ValueError("Invalid velocity set")
-
-    xlb.init(
-        velocity_set=velocity_set,
-        default_backend=backend,
-        default_precision_policy=precision_policy,
-    )
-
-    return backend, precision_policy
-
-
-def run(backend, precision_policy, grid_shape, num_steps):
-    # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    grid = grid_factory(grid_shape, velocity_set=velocity_set)
-    box = grid.bounding_box_indices()
-    box_no_edge = grid.bounding_box_indices(remove_edges=True)
-    lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
-    walls = np.unique(np.array(walls), axis=-1).tolist()
-
-    prescribed_vel = 0.05
-
-    boundary_conditions = [EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
-
-    # Create stepper
-    stepper = IncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
-
-    Re = 10000.0
-    clength = grid_shape[0] - 1
-    visc = prescribed_vel * clength / Re
-    omega = 1.0 / (3.0 * visc + 0.5)
-
-    # # Initialize fields and run simulation
-    # omega = 1.0
-
-    sim = xlb.helper.nse_solver.Nse_simulation(grid, velocity_set, stepper, omega)
-    print("start timing")
-    start_time = time.time()
-
-    for i in range(num_steps):
-        sim.step()
-        if i % 500 == 0:
-            sim.export_macroscopic("u_lid_driven_cavity_")
-    wp.synchronize()
-    t = time.time() - start_time
-
-    sim.export_macroscopic("u_lid_driven_cavity_")
-    return t
-
-
-def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    total_lattice_updates = cube_edge**3 * num_steps
-    mlups = (total_lattice_updates / elapsed_time) / 1e6
-    return mlups
-
-
-def post_process(macro, rho, u, f_0, i):
-    # Write the results. We'll use JAX backend for the post-processing
-    # import jax.numpy as jnp
-    # if not isinstance(f_0, jnp.ndarray):
-    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
-    #     f_0 = wp.to_jax(f_0)[..., 0]
-    # else:
-    #     f_0 = f_0
-    rho, u = macro(f_0, rho, u)
-    wp.synchronize()
-    u.update_host(0)
-    rho.update_host(0)
-    wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
-
-    pass
-
-    # # remove boundary cells
-    # rho = rho[:, 1:-1, 1:-1, 1:-1]
-    # u = u[:, 1:-1, 1:-1, 1:-1]
-    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
-    #
-    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
-    #
-    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
-    # ny=fields["u_magnitude"].shape[1]
-    # from xlb.utils import  save_image
-    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
-
-
-def main():
-    args = parse_arguments()
-    backend, precision_policy = setup_simulation(args)
-    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
-    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
-
-    print(f"Simulation completed in {elapsed_time:.2f} seconds")
-    print(f"MLUPs: {mlups:.2f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/xlb/helper/nse_solver.py b/xlb/helper/nse_solver.py
index 7e143586..075f6ab3 100644
--- a/xlb/helper/nse_solver.py
+++ b/xlb/helper/nse_solver.py
@@ -36,7 +36,7 @@ def create_nse_fields(
     # Create fields
     f_0 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
     f_1 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
-    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
+    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.UINT8)
     bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
     return grid, f_0, f_1, missing_mask, bc_mask
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 1740f689..4fb22180 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -157,8 +157,8 @@ def kernel(
 
         return None, kernel
 
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+    # a helper for this operator
+    def _prepare_warp_kernel_inputs(self, bclist, bc_mask):
         # Pre-allocate arrays with maximum possible size
         max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
         indices = np.zeros((3, max_size), dtype=np.int32)
@@ -195,19 +195,26 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
             bc.__dict__.pop("indices", None)
 
         # Trim arrays to actual size
-        indices = indices[:, :current_index]
-        id_numbers = id_numbers[:current_index]
-        is_interior = is_interior[:current_index]
+        total_index = current_index
+        indices = indices[:, :total_index]
+        id_numbers = id_numbers[:total_index]
+        is_interior = is_interior[:total_index]
 
         # Convert to Warp arrays
         wp_indices = wp.array(indices, dtype=wp.int32)
         wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
         wp_is_interior = wp.array(is_interior, dtype=wp.bool)
+        return total_index, wp_indices, wp_id_numbers, wp_is_interior
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+        # prepare warp kernel inputs
+        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_warp_kernel_inputs(bclist, bc_mask)
 
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
-            dim=current_index,
+            dim=total_index,
             inputs=[
                 wp_indices,
                 wp_id_numbers,
@@ -225,65 +232,21 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+        import neon, typing
+
         # Pre-allocate arrays with maximum possible size
         velocity_set = xlb_grid._get_velocity_set()
         missing_mask_warp = xlb_grid._create_warp_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
         bc_mask_warp = xlb_grid._create_warp_field(cardinality=1, dtype=Precision.UINT8)
         _, warp_kernel = self._construct_warp()
 
-        max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
-        indices = np.zeros((3, max_size), dtype=np.int32)
-        id_numbers = np.zeros(max_size, dtype=np.uint8)
-        is_interior = np.zeros(max_size, dtype=bool)
-
-        current_index = 0
-        for bc in bclist:
-            assert bc.indices is not None, f'Please specify indices associated with the {bc.__class__.__name__} BC using keyword "indices"!'
-            assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
-
-            bc_indices = np.asarray(bc.indices)
-            num_indices = bc_indices.shape[1]
-
-            # Ensure indices are 3D
-            if bc_indices.shape[0] == 2:
-                bc_indices = np.vstack([bc_indices, np.zeros(num_indices, dtype=int)])
-
-            # Add indices to the pre-allocated array
-            indices[:, current_index : current_index + num_indices] = bc_indices
-
-            # Set id numbers
-            id_numbers[current_index : current_index + num_indices] = bc.id
-
-            # Set is_interior flags
-            if bc.needs_padding:
-                is_interior[current_index : current_index + num_indices] = self.are_indices_in_interior(bc_indices, bc_mask_warp[0].shape)
-            else:
-                is_interior[current_index : current_index + num_indices] = False
-
-            current_index += num_indices
-
-            # Remove indices from BC objects
-            bc.__dict__.pop("indices", None)
-
-        # Trim arrays to actual size
-        indices = indices[:, :current_index]
-        id_numbers = id_numbers[:current_index]
-        is_interior = is_interior[:current_index]
-
-        # Convert to Warp arrays
-        wp_indices = wp.array(indices, dtype=wp.int32)
-        wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
-        wp_is_interior = wp.array(is_interior, dtype=wp.bool)
-
-        if start_index is None:
-            start_index = wp.vec3i(0, 0, 0)
-        else:
-            start_index = wp.vec3i(*start_index)
+        # prepare warp kernel inputs
+        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_warp_kernel_inputs(bclist, bc_mask)
 
         # Launch the warp kernel
         wp.launch(
             warp_kernel,
-            dim=current_index,
+            dim=total_index,
             inputs=[
                 wp_indices,
                 wp_id_numbers,
@@ -294,8 +257,6 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
         )
         wp.synchronize()
 
-        import neon, typing
-
         @neon.Container.factory("")
         def container(
             bc_mask_warp: typing.Any,
@@ -304,10 +265,15 @@ def container(
             missing_mask_field: typing.Any,
         ):
             def loading_step(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask.get_grid(), level=0)
-
-                bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
-                missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
+                grid_name = bc_mask_field.get_grid().get_name()
+                if grid_name == "mGrid":
+                    loader.set_mres_grid(bc_mask_field.get_grid(), level=0)
+                    bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
+                    missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
+                else:
+                    loader.set_grid(bc_mask_field.get_grid())
+                    bc_mask_hdl = loader.get_write_handle(bc_mask_field)
+                    missing_mask_hdl = loader.get_write_handle(missing_mask_field)
 
                 @wp.func
                 def masker(gridIdx: typing.Any):
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index ab4dc26a..77365bc5 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -390,13 +390,13 @@ def device(index: typing.Any):
 
         @neon.Container.factory(name="stream_coarse_step_ABC")
         def stream_coarse_step_ABC(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             num_levels = f_0_fd.get_grid().get_num_levels()
 
@@ -421,7 +421,6 @@ def ll_stream_coarse(loader: neon.Loader):
                 coalescence_factor_fd = omega
                 coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
 
-
                 @wp.func
                 def cl_stream_coarse(index: typing.Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
@@ -435,10 +434,7 @@ def cl_stream_coarse(index: typing.Any):
 
                     # do stream normally
                     _missing_mask = _missing_mask_vec()
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn,
-                                                                                 f_1_pn,
-                                                                                 missing_mask_pn,
-                                                                                 index)
+                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
@@ -447,13 +443,10 @@ def cl_stream_coarse(index: typing.Any):
                             # HERE, we skip the center direction
                             continue
 
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]),
-                                                         wp.int8(-_c[1, l]),
-                                                         wp.int8(-_c[2, l]))
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
-                                                       has_ngh_at_same_level)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
@@ -464,9 +457,9 @@ def cl_stream_coarse(index: typing.Any):
                                 if wp.neon_has_parent(f_0_pn, index):
                                     # YES halo cell on top of us
                                     has_a_courser_ngh = wp.bool(False)
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l,
-                                                                                self.compute_dtype(0),
-                                                                                has_a_courser_ngh)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                    )
                                     if has_a_courser_ngh:
                                         # Full state:
                                         # NO finer ngh. in the pull direction (opposite of l)
@@ -493,36 +486,13 @@ def cl_stream_coarse(index: typing.Any):
                                 # -> **Coalescence**
                                 coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                                 accumulated = accumulated * coalescence_factor
-                                #wp.neon_write(f_1_pn, index, l, accumulated)
+                                # wp.neon_write(f_1_pn, index, l, accumulated)
                                 _f_post_stream[l] = accumulated
                             else:
                                 wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
-
-
-
-
-
                     # do non mres post-streaming corrections
-                    _f_post_stream = apply_bc(
-                        index, timestep,
-                        _boundary_id,
-                        _missing_mask,
-                        f_0_pn, f_1_pn,
-                        _f_post_collision, _f_post_stream, True
-                    )
-
-
-
-
-
-
-
-
-
-
-
-
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])

From 95ae79445e1673ea8f19146fa560c6a2648c9f45 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 21 May 2025 10:43:04 +0200
Subject: [PATCH 040/208] Update test

---
 .../3_levels_mlups_3d_multires_solver.py      | 30 ++++++++++---------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index e48a6f4e..d73db508 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -101,25 +101,27 @@ def get_peeled_np(level, width):
                 for k in range(m.z):
                     idx = neon.Index_3d(i, j, k)
                     val = 0
-                    if peel(m, idx, m.x / width, True):
+                    if peel(m, idx, width, True):
                         val = 1
                     mask[i, j, k] = val
         return mask
 
-    levels = []
 
-    l0 = get_peeled_np(0, 17)
-    l1 = get_peeled_np(1, 7)
-    l2 = get_peeled_np(2, 4)
-
-    num_levels = 4
-    lastLevel = num_levels -1
-    divider = 2**lastLevel
-    m = neon.Index_3d(dim.x // divider +1, dim.y // divider+1, dim.z // divider+1)
-    lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
-    lastLevel = np.ascontiguousarray(lastLevel, dtype=np.int32)
-
-    levels = [l0, l1, l2, lastLevel]
+    def get_levels(num_levels):
+        levels = []
+        for i in range(num_levels-1):
+            l = get_peeled_np(i, 8)
+            levels.append(l)
+        lastLevel = num_levels - 1
+        divider = 2 ** lastLevel
+        m = neon.Index_3d(dim.x // divider + 1, dim.y // divider + 1, dim.z // divider + 1)
+        lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
+        lastLevel = np.ascontiguousarray(lastLevel, dtype=np.int32)
+        levels.append(lastLevel)
+        return levels
+
+    num_levels = 5
+    levels = get_levels(num_levels)
 
     grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
                                  sparsity_pattern_list=levels,

From acfc54d5828941803fc7b55120e30e684daf2247 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 22 May 2025 19:29:05 -0400
Subject: [PATCH 041/208] Added a multires_boundary_masker and made sure
 results are correct.

---
 .gitignore                                    |   4 +
 .../3_levels_mlups_3d_multires_solver.py      | 111 ++++++++++++------
 xlb/grid/grid.py                              |  24 ++--
 xlb/grid/multires_grid.py                     |  66 ++---------
 xlb/grid/neon_grid.py                         |  17 ++-
 xlb/operator/boundary_masker/__init__.py      |   1 +
 .../indices_boundary_masker.py                |  74 +++++-------
 .../multires_boundary_masker.py               | 109 +++++++++++++++++
 .../mulltires_quadratic_equilibrium.py        |   3 -
 .../equilibrium/quadratic_equilibrium.py      |   3 -
 xlb/operator/stepper/nse_multires_stepper.py  |  23 ++--
 xlb/velocity_set/d3q19.py                     |   1 -
 xlb/velocity_set/velocity_set.py              |  21 ++++
 13 files changed, 276 insertions(+), 181 deletions(-)
 create mode 100644 xlb/operator/boundary_masker/multires_boundary_masker.py

diff --git a/.gitignore b/.gitignore
index aecc7a13..b65678c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -156,3 +156,7 @@ dist/
 build/
 *.egg-info/
 *.dot
+
+# Ignore h5 and xmf formats
+*.h5
+*.xmf
diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index 2cef2c49..a9611f44 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -3,11 +3,6 @@
 import time
 import warp as wp
 import numpy as np
-
-# add a directory to the PYTHON PATH
-import sys
-
-# sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
 import neon
 
 from xlb.compute_backend import ComputeBackend
@@ -15,25 +10,6 @@
 from xlb.grid import multires_grid_factory
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
-from xlb.distribute import distribute
-
-
-import time
-import numpy as np
-import vtk
-from vtk.util.numpy_support import numpy_to_vtk, numpy_to_vtkIdTypeArray
-from pathlib import Path
-import open3d as o3d
-from tabulate import tabulate
-import h5py
-import cupy as cp  # Added for GPU-accelerated array operations
-
-# Import and initialize NVIDIA Warp for GPU acceleration
-import warp as wp
-
-wp.init()
-DEVICE = "cuda"
-
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
@@ -88,10 +64,25 @@ def setup_simulation(args):
     return compute_backend, precision_policy
 
 
-def run(compute_backend, precision_policy, grid_shape, num_steps):
-    # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
-
+# def construct_indices_per_level(grid_shape_finest, indices_finest, active_voxels_mask_per_level, level_origins):
+#     # TODO: HS: This is not the efficient way of doing this. I need to write a Warp operator for this purpose
+#     num_levels = len(active_voxels_mask_per_level)
+#     indices_list = []
+#     for level in range(num_levels):
+#         refinement = 2**level
+#         grid_shape = tuple(x // refinement for x in grid_shape_finest)
+#         mask = np.zeros(grid_shape, dtype=bool)
+#         ox, oy, oz = level_origins[level]
+#         Lx, Ly, Lz = active_voxels_mask_per_level[level].shape
+#         mask[ox : ox + Lx, oy : oy + Ly, oz : oz + Lz] = active_voxels_mask_per_level[level]
+#         indices_per_level = (np.array(indices_finest) // refinement)[:, ::refinement]
+#         mask_per_level = mask[tuple(indices_per_level)]
+#         active_bc_indices_per_level = indices_per_level[:, mask_per_level].tolist()
+#         indices_list.append(active_bc_indices_per_level)
+#     return indices_list
+
+
+def problem1(grid_shape, velocity_set):
     def peel(dim, idx, peel_level, outwards):
         if outwards:
             xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
@@ -125,7 +116,6 @@ def get_peeled_np(level, width):
                     mask[i, j, k] = val
         return mask
 
-
     def get_levels(num_levels):
         levels = []
         for i in range(num_levels-1):
@@ -139,7 +129,7 @@ def get_levels(num_levels):
         levels.append(lastLevel)
         return levels
 
-    num_levels = 5
+    num_levels = 4
     levels = get_levels(num_levels)
 
     grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
@@ -151,9 +141,57 @@ def get_levels(num_levels):
     lid = box_no_edge["top"]
     walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
+    # convert bc indices to a list of lists, where the first entry of the list corresponds to the finest level
+    lid = [lid, [], [], []]
+    walls = [walls, [], [], []]
+    return grid, lid, walls
+
+
+def problem2(grid_shape, velocity_set):
+    # Example 2: Coarsest at the edges (2 level only)
+    num_levels = 2
+    level_1 = np.ones((grid_shape[0] // 2, grid_shape[1] // 2, grid_shape[2] // 2), dtype=int)
+    finestLevel = np.ones((40, 40, 40), dtype=int)
+    finestLevel = np.ascontiguousarray(finestLevel, dtype=np.int32)
+    levels = [finestLevel, level_1]
+    level_origins = [(44, 44, 44), (0, 0, 0)]
+
+    # Create the multires grid
+    grid = multires_grid_factory(
+        grid_shape,
+        velocity_set=velocity_set,
+        sparsity_pattern_list=levels,
+        sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
+    )
 
-    prescribed_vel = 0.1
+    box = grid.bounding_box_indices(shape=grid.level_to_shape(1))
+    box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(1), remove_edges=True)
+    lid = box_no_edge["top"]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = np.unique(np.array(walls), axis=-1).tolist()
+    # convert bc indices to a list of lists, where the first entry of the list corresponds to the finest level
+    lid = [[], lid]
+    walls = [[], walls]
+    return grid, lid, walls
+
+
+def run(compute_backend, precision_policy, grid_shape, num_steps):
+    # Create grid and setup boundary conditions
+    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
+
+    # Convert indices to list of indices per level
+    # TODO: overlaps emerge if bc indices are originally specified at the finest grid and they exist at the coarser levels
+    # levels_mask = [lvl.astype(bool) for lvl in levels]
+    # lid = construct_indices_per_level(grid_shape, lid, levels_mask, level_origins)
+    # walls = construct_indices_per_level(grid_shape, walls, levels_mask, level_origins)
 
+    # Example 1: fine to coarse
+    grid, lid, walls = problem1(grid_shape, velocity_set)
+
+    # Example 2: Coarse to fine:
+    # grid, lid, walls = problem2(grid_shape, velocity_set)
+
+    prescribed_vel = 0.1
     boundary_conditions = [
         EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
         EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls),
@@ -162,12 +200,12 @@ def get_levels(num_levels):
     # Create stepper
     stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
 
-    Re = 5000.0
+    # Re = 5000.0
 
-    clength = grid_shape[0] - 1
-    visc = prescribed_vel * clength / Re
-    omega = 1.0 / (3.0 * visc + 0.5)
-    # omega = 1.0
+    # clength = grid_shape[0] - 1
+    # visc = prescribed_vel * clength / Re
+    # omega = 1.0 / (3.0 * visc + 0.5)
+    omega = 1.0
 
     sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
 
@@ -187,6 +225,7 @@ def get_levels(num_levels):
     print(f"Timing  {t}")
 
     # sim.export_macroscopic("u_lid_driven_cavity_")
+    num_levels = grid.count_levels
     return {"time": t, "num_levels": num_levels}
 
 
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index d4696dd4..7ae34684 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -37,23 +37,15 @@ def multires_grid_factory(
     sparsity_pattern_origins: List[neon.Index_3d] = [],
 ):
     compute_backend = compute_backend or DefaultConfig.default_backend
-    if compute_backend == ComputeBackend.WARP:
-        from xlb.grid.warp_grid import WarpGrid
-
-        raise ValueError(f"Compute backend {compute_backend} is not supported for multires grid")
-
     if compute_backend == ComputeBackend.NEON:
         from xlb.grid.multires_grid import NeonMultiresGrid
 
         return NeonMultiresGrid(
             shape=shape, velocity_set=velocity_set, sparsity_pattern_list=sparsity_pattern_list, sparsity_pattern_origins=sparsity_pattern_origins
         )
-
-    elif compute_backend == ComputeBackend.JAX:
+    else:
         raise ValueError(f"Compute backend {compute_backend} is not supported for multires grid")
 
-    raise ValueError(f"Compute backend {compute_backend} is not supported for multires grid")
-
 
 class Grid(ABC):
     def __init__(
@@ -73,7 +65,7 @@ def _initialize_backend(self):
     def get_compute_backend(self):
         return self.compute_backend
 
-    def bounding_box_indices(self, remove_edges=False):
+    def bounding_box_indices(self, shape=None, remove_edges=False):
         """
         This function calculates the indices of the bounding box of a 2D or 3D grid.
         The bounding box is defined as the set of grid points on the outer edge of the grid.
@@ -91,9 +83,13 @@ def bounding_box_indices(self, remove_edges=False):
         are numpy arrays of indices corresponding to each face.
         """
 
+        # If shape is not given, use self.shape
+        if shape is None:
+            shape = self.shape
+
         # Get the shape of the grid
         origin = np.array([0, 0, 0])
-        bounds = np.array(self.shape)
+        bounds = np.array(shape)
         if remove_edges:
             origin += 1
             bounds -= 1
@@ -102,11 +98,11 @@ def bounding_box_indices(self, remove_edges=False):
         dim = len(bounds)
 
         # Generate bounding box indices for each face
-        grid = np.indices(self.shape)
+        grid = np.indices(shape)
         boundingBoxIndices = {}
 
         if dim == 2:
-            nx, ny = self.shape
+            nx, ny = shape
             boundingBoxIndices = {
                 "bottom": grid[:, slice_x, 0],
                 "top": grid[:, slice_x, ny - 1],
@@ -114,7 +110,7 @@ def bounding_box_indices(self, remove_edges=False):
                 "right": grid[:, nx - 1, slice_y],
             }
         elif dim == 3:
-            nx, ny, nz = self.shape
+            nx, ny, nz = shape
             slice_z = slice(origin[2], bounds[2])
             boundingBoxIndices = {
                 "bottom": grid[:, slice_x, slice_y, 0].reshape(3, -1),
diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index d434fdc9..b86fed2c 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -16,21 +16,19 @@ def __init__(
         sparsity_pattern_list: List[np.ndarray],
         sparsity_pattern_origins: List[neon.Index_3d],
     ):
-        from .warp_grid import WarpGrid
-
         self.bk = None
         self.dim = None
         self.grid = None
-        self.xlb_lattice = velocity_set
-        self.warp_grid = WarpGrid(shape)
+        self.velocity_set = velocity_set
         self.sparsity_pattern_list = sparsity_pattern_list
         self.sparsity_pattern_origins = sparsity_pattern_origins
         self.count_levels = len(sparsity_pattern_list)
+        self.refinement_factor = 2
 
         super().__init__(shape, ComputeBackend.NEON)
 
     def _get_velocity_set(self):
-        return self.xlb_lattice
+        return self.velocity_set
 
     def _initialize_backend(self):
         # FIXME@max: for now we hardcode the number of devices to 0
@@ -42,19 +40,16 @@ def _initialize_backend(self):
 
             self.dim = py_neon.Index_3d(self.shape[0], 1, self.shape[1])
             self.neon_stencil = []
-            for c_idx in range(len(self.xlb_lattice._c[0])):
-                xval = self.xlb_lattice._c[0][c_idx]
-                yval = self.xlb_lattice._c[1][c_idx]
+            for q in range(self.velocity_set.q):
+                xval, yval = self.velocity_set._c[:, q]
                 self.neon_stencil.append([xval, 0, yval])
 
         else:
             self.dim = neon.Index_3d(self.shape[0], self.shape[1], self.shape[2])
 
             self.neon_stencil = []
-            for c_idx in range(len(self.xlb_lattice._c[0])):
-                xval = self.xlb_lattice._c[0][c_idx]
-                yval = self.xlb_lattice._c[1][c_idx]
-                zval = self.xlb_lattice._c[2][c_idx]
+            for q in range(self.velocity_set.q):
+                xval, yval, zval = self.velocity_set._c[:, q]
                 self.neon_stencil.append([xval, yval, zval])
 
         self.bk = neon.Backend(runtime=neon.Backend.Runtime.stream, dev_idx_list=dev_idx_list)
@@ -95,47 +90,6 @@ def create_field(
     def get_neon_backend(self):
         return self.bk
 
-    def _create_warp_field(
-        self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None, fill_value=None, ne_field=None
-    ):
-        print("WARNING: allocating warp fields for mres is temporary and only a work around!")
-        warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
-        if ne_field is None:
-            return warp_field
-
-        _d = self.xlb_lattice.d
-
-        import typing
-
-        @neon.Container.factory(mame="cloning-warp")
-        def container(src_field: typing.Any, dst_field: typing.Any, cardinality: wp.int32):
-            def loading_step(loader: neon.Loader):
-                loader.declare_execution_scope(self.grid, level=0)
-                src_pn = loader.get_read_handel(src_field)
-
-                @wp.func
-                def cloning(gridIdx: typing.Any):
-                    cIdx = wp.neon_global_idx(src_pn, gridIdx)
-                    gx = wp.neon_get_x(cIdx)
-                    gy = wp.neon_get_y(cIdx)
-                    gz = wp.neon_get_z(cIdx)
-
-                    # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
-                    if _d == 2:
-                        gy, gz = gz, gy
-
-                    for card in range(cardinality):
-                        value = wp.neon_read(src_pn, gridIdx, card)
-                        dst_field[card, gx, gy, gz] = value
-
-                loader.declare_kernel(cloning)
-
-            return loading_step
-
-        c = container(src_field=ne_field, dst_field=warp_field, cardinality=cardinality)
-        c.run(0)
-        wp.synchronize()
-        return warp_field
-
-    def get_neon_backend(self):
-        return self.bk
+    def level_to_shape(self, level):
+        # level = 0 corresponds to the finest level
+        return tuple(x // self.refinement_factor**level for x in self.shape)
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index 126ae627..31a08c30 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -14,13 +14,13 @@ def __init__(self, shape, velocity_set):
         self.bk = None
         self.dim = None
         self.grid = None
-        self.xlb_lattice = velocity_set
+        self.velocity_set = velocity_set
         self.warp_grid = WarpGrid(shape)
 
         super().__init__(shape, ComputeBackend.NEON)
 
     def _get_velocity_set(self):
-        return self.xlb_lattice
+        return self.velocity_set
 
     def _initialize_backend(self):
         # FIXME@max: for now we hardcode the number of devices to 0
@@ -32,19 +32,16 @@ def _initialize_backend(self):
 
             self.dim = py_neon.Index_3d(self.shape[0], 1, self.shape[1])
             self.neon_stencil = []
-            for c_idx in range(len(self.xlb_lattice._c[0])):
-                xval = self.xlb_lattice._c[0][c_idx]
-                yval = self.xlb_lattice._c[1][c_idx]
+            for q in range(self.velocity_set.q):
+                xval, yval = self.velocity_set._c[:, q]
                 self.neon_stencil.append([xval, 0, yval])
 
         else:
             self.dim = neon.Index_3d(self.shape[0], self.shape[1], self.shape[2])
 
             self.neon_stencil = []
-            for c_idx in range(len(self.xlb_lattice._c[0])):
-                xval = self.xlb_lattice._c[0][c_idx]
-                yval = self.xlb_lattice._c[1][c_idx]
-                zval = self.xlb_lattice._c[2][c_idx]
+            for q in range(self.velocity_set.q):
+                xval, yval, zval = self.velocity_set._c[:, q]
                 self.neon_stencil.append([xval, yval, zval])
 
         self.bk = neon.Backend(runtime=neon.Backend.Runtime.stream, dev_idx_list=dev_idx_list)
@@ -77,7 +74,7 @@ def _create_warp_field(
         if ne_field is None:
             return warp_field
 
-        _d = self.xlb_lattice.d
+        _d = self.velocity_set.d
 
         import typing
 
diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index 3417c3c8..c76ce13c 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -1,2 +1,3 @@
 from xlb.operator.boundary_masker.indices_boundary_masker import IndicesBoundaryMasker
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
+from xlb.operator.boundary_masker.multires_boundary_masker import MultiresBoundaryMasker
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 4fb22180..379300b1 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -1,11 +1,14 @@
-import numpy as np
-import warp as wp
+from typing import Any
+
 import jax
 import jax.numpy as jnp
+import numpy as np
+import warp as wp
+
 from xlb.compute_backend import ComputeBackend
+from xlb.grid import grid_factory
 from xlb.operator.operator import Operator
 from xlb.operator.stream.stream import Stream
-from xlb.grid import grid_factory
 from xlb.precision_policy import Precision
 
 
@@ -117,7 +120,7 @@ def kernel(
             id_number: wp.array1d(dtype=wp.uint8),
             is_interior: wp.array1d(dtype=wp.bool),
             bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
+            missing_mask: wp.array4d(dtype=wp.uint8),
         ):
             # Get the index of indices
             ii = wp.tid()
@@ -147,12 +150,12 @@ def kernel(
                     # These directions will have missing information after streaming
                     if not check_index_bounds(pull_index, shape):
                         # Set the missing mask
-                        missing_mask[l, index[0], index[1], index[2]] = True
+                        missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
 
                     # handling geometries in the interior of the computational domain
                     elif check_index_bounds(pull_index, shape) and is_interior[ii]:
                         # Set the missing mask
-                        missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
+                        missing_mask[l, push_index[0], push_index[1], push_index[2]] = wp.uint8(True)
                         bc_mask[0, push_index[0], push_index[1], push_index[2]] = id_number[ii]
 
         return None, kernel
@@ -226,57 +229,38 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
 
         return bc_mask, missing_mask
 
-    def _construct_neon(self):
-        # All the computation is done at the register step
-        return None, None
-
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
-        import neon, typing
+        import neon
 
         # Pre-allocate arrays with maximum possible size
-        velocity_set = xlb_grid._get_velocity_set()
-        missing_mask_warp = xlb_grid._create_warp_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
-        bc_mask_warp = xlb_grid._create_warp_field(cardinality=1, dtype=Precision.UINT8)
-        _, warp_kernel = self._construct_warp()
-
-        # prepare warp kernel inputs
-        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_warp_kernel_inputs(bclist, bc_mask)
-
-        # Launch the warp kernel
-        wp.launch(
-            warp_kernel,
-            dim=total_index,
-            inputs=[
-                wp_indices,
-                wp_id_numbers,
-                wp_is_interior,
-                bc_mask_warp,
-                missing_mask_warp,
-            ],
+        grid_warp = grid_factory(xlb_grid.shape, compute_backend=ComputeBackend.WARP, velocity_set=self.velocity_set)
+        missing_mask_warp = grid_warp.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
+        bc_mask_warp = grid_warp.create_field(cardinality=1, dtype=Precision.UINT8)
+
+        # Use indices masker with the warp backend to build bc_mask_warp and missing_mask_warp before writing in Neon DS.
+        indices_masker_warp = IndicesBoundaryMasker(
+            velocity_set=self.velocity_set,
+            precision_policy=self.precision_policy,
+            compute_backend=ComputeBackend.WARP,
         )
+        bc_mask_warp, missing_mask_warp = indices_masker_warp(bclist, bc_mask_warp, missing_mask_warp, start_index, xlb_grid)
         wp.synchronize()
 
         @neon.Container.factory("")
         def container(
-            bc_mask_warp: typing.Any,
-            missing_mask_warp: typing.Any,
-            bc_mask_field: typing.Any,
-            missing_mask_field: typing.Any,
+            bc_mask_warp: Any,
+            missing_mask_warp: Any,
+            bc_mask_field: Any,
+            missing_mask_field: Any,
         ):
             def loading_step(loader: neon.Loader):
-                grid_name = bc_mask_field.get_grid().get_name()
-                if grid_name == "mGrid":
-                    loader.set_mres_grid(bc_mask_field.get_grid(), level=0)
-                    bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
-                    missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
-                else:
-                    loader.set_grid(bc_mask_field.get_grid())
-                    bc_mask_hdl = loader.get_write_handle(bc_mask_field)
-                    missing_mask_hdl = loader.get_write_handle(missing_mask_field)
+                loader.set_grid(bc_mask_field.get_grid())
+                bc_mask_hdl = loader.get_write_handle(bc_mask_field)
+                missing_mask_hdl = loader.get_write_handle(missing_mask_field)
 
                 @wp.func
-                def masker(gridIdx: typing.Any):
+                def masker(gridIdx: Any):
                     cIdx = wp.neon_global_idx(bc_mask_hdl, gridIdx)
                     gx = wp.neon_get_x(cIdx)
                     gy = wp.neon_get_y(cIdx)
@@ -286,7 +270,7 @@ def masker(gridIdx: typing.Any):
                     wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
 
                     for q in range(self.velocity_set.q):
-                        is_missing = wp.uint8(missing_mask_warp[q, wp.neon_get_x(cIdx), wp.neon_get_z(cIdx), wp.neon_get_y(cIdx)])
+                        is_missing = wp.uint8(missing_mask_warp[q, gx, gz, gy])
                         wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
 
                 loader.declare_kernel(masker)
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
new file mode 100644
index 00000000..a58c5312
--- /dev/null
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -0,0 +1,109 @@
+import warp as wp
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+from xlb.grid import grid_factory
+from xlb.precision_policy import Precision
+from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
+import neon, typing, copy
+
+
+class MultiresBoundaryMasker(Operator):
+    """
+    Operator for creating a boundary mask for multi-resolution grids
+    """
+
+    def __init__(
+        self,
+        velocity_set=None,
+        precision_policy=None,
+        compute_backend=None,
+    ):
+        # Call super
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+        # Create boundary maskers using the WARP backend
+        self.indices_masker = IndicesBoundaryMasker(
+            velocity_set=velocity_set,
+            precision_policy=precision_policy,
+            compute_backend=ComputeBackend.WARP,
+        )
+        self.mesh_masker = MeshBoundaryMasker(
+            velocity_set=velocity_set,
+            precision_policy=precision_policy,
+            compute_backend=ComputeBackend.WARP,
+        )
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
+        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in JAX!")
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in WARP!")
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+        # Ensure that this operator is called on multires grids
+        assert bc_mask.get_grid().get_name() == "mGrid", f"Operation {self.__class__.__name__} is only applicable to multi-resolution cases"
+
+        # number of levels
+        # indices_per_level = []
+        num_levels = bc_mask.get_grid().get_num_levels()
+        for level in range(num_levels):
+            # Use the warp backend to create dense fields to be written in multi-res NEON fields
+            refinement = 2**level
+            grid_shape = tuple(x // refinement for x in xlb_grid.shape)
+            grid_dense = grid_factory(grid_shape, compute_backend=ComputeBackend.WARP)
+            missing_mask_warp = grid_dense.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
+            bc_mask_warp = grid_dense.create_field(cardinality=1, dtype=Precision.UINT8)
+
+            # create a new bclist for this level only
+            bclist_level = []
+            for bc in bclist:
+                if bc.indices is not None and bc.indices[level]:
+                    bc_copy = copy.deepcopy(bc)
+                    bc_copy.indices = bc_copy.indices[level]
+                    bclist_level.append(bc_copy)
+
+            # call indices masker for this level
+            bc_mask_warp, missing_mask_warp = self.indices_masker(bclist_level, bc_mask_warp, missing_mask_warp, start_index, xlb_grid)
+
+            @neon.Container.factory("")
+            def container(
+                bc_mask_warp: typing.Any,
+                missing_mask_warp: typing.Any,
+                bc_mask_field: typing.Any,
+                missing_mask_field: typing.Any,
+            ):
+                def loading_step(loader: neon.Loader):
+                    loader.set_mres_grid(bc_mask_field.get_grid(), level)
+                    bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
+                    missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
+
+                    @wp.func
+                    def masker(gridIdx: typing.Any):
+                        cIdx = wp.neon_global_idx(bc_mask_hdl, gridIdx)
+                        # get local indices by dividing the global indices (associated with the finest level) by 2^level
+                        lx = wp.neon_get_x(cIdx) // refinement
+                        ly = wp.neon_get_y(cIdx) // refinement
+                        lz = wp.neon_get_z(cIdx) // refinement
+                        # TODO@Max - XLB is flattening the y dimension in 3D, while neon uses the z dimension
+                        local_mask = bc_mask_warp[0, lx, lz, ly]
+                        wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
+
+                        for q in range(self.velocity_set.q):
+                            is_missing = wp.uint8(missing_mask_warp[q, lx, lz, ly])
+                            wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
+
+                    loader.declare_kernel(masker)
+
+                return loading_step
+
+            c = container(bc_mask_warp, missing_mask_warp, bc_mask, missing_mask)
+            c.run(0)
+            wp.synchronize()
+
+            del bc_mask_warp
+            del missing_mask_warp
+
+        return bc_mask, missing_mask
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index e1346b1b..0e9288d1 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -4,9 +4,6 @@
 import warp as wp
 import os
 
-# Print the PYTHONPATH
-pythonpath = os.environ.get("PYTHONPATH", "PYTHONPATH is not set")
-print(f"PYTHONPATH: {pythonpath}")
 import neon
 from typing import Any
 
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index 89e2aaef..a286d6a1 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -4,9 +4,6 @@
 import warp as wp
 import os
 
-# Print the PYTHONPATH
-pythonpath = os.environ.get("PYTHONPATH", "PYTHONPATH is not set")
-print(f"PYTHONPATH: {pythonpath}")
 import neon
 from typing import Any
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 77365bc5..93ad346a 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -20,7 +20,7 @@
 from xlb.operator.boundary_condition.boundary_condition import ImplementationStep
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.collision import ForcedCollision
-from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
+from xlb.operator.boundary_masker import MultiresBoundaryMasker
 from xlb.helper import check_bc_overlaps
 
 
@@ -96,6 +96,7 @@ def prepare_fields(self, rho, u, initializer=None):
         return f_0, f_1, bc_mask, missing_mask
 
     def prepare_coalescence_count(self, coalescence_factor, bc_mask):
+        lattice_central_index = self.velocity_set.center_index
         num_levels = coalescence_factor.get_grid().get_num_levels()
 
         @neon.Container.factory(name="sum_kernel_by_level")
@@ -163,7 +164,7 @@ def compute(index: typing.Any):
                         return
 
                     for l in range(self.velocity_set.q):
-                        if l == 9:
+                        if l == lattice_central_index:
                             # HERE, we skip the center direction
                             continue
 
@@ -212,9 +213,9 @@ def compute(index: typing.Any):
     def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
         """Process boundary conditions and update boundary masks."""
         # Check for boundary condition overlaps
-        check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
+        # TODO! check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
         # Create boundary maskers
-        indices_masker = IndicesBoundaryMasker(
+        mres_masker = MultiresBoundaryMasker(
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
@@ -224,7 +225,7 @@ def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask
         bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
         # Process indices-based boundary conditions
         if bc_with_indices:
-            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
+            bc_mask, missing_mask = mres_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
             # throw an exception because this option is not implemented yet
@@ -249,22 +250,18 @@ def _initialize_auxiliary_data(boundary_conditions, f_0, f_1, bc_mask, missing_m
 
     def _construct_neon(self):
         # Set local constants
+        lattice_central_index = self.velocity_set.center_index
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
-        _opp_indices = self.velocity_set.opp_indices
-        # _cast_to_store_dtype = self.store_dtype()
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
-        id_to_bc = boundary_condition_registry.id_to_bc
-        _zero = self.compute_dtype(0)
+
         # Gather IDs of ExtrapolationOutflowBC boundary conditions
         extrapolation_outflow_bc_ids = []
         for bc_name, bc_id in bc_to_id.items():
             if bc_name.startswith("ExtrapolationOutflowBC"):
                 extrapolation_outflow_bc_ids.append(bc_id)
-        # Group active boundary conditions
-        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
 
         @wp.func
         def apply_bc(
@@ -439,7 +436,7 @@ def cl_stream_coarse(index: typing.Any):
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
                     for l in range(self.velocity_set.q):
-                        if l == 9:
+                        if l == lattice_central_index:
                             # HERE, we skip the center direction
                             continue
 
@@ -589,7 +586,7 @@ def cl_stream_coarse(index: typing.Any):
                         return
 
                     for l in range(self.velocity_set.q):
-                        if l == 9:
+                        if l == lattice_central_index:
                             # HERE, we skip the center direction
                             continue
 
diff --git a/xlb/velocity_set/d3q19.py b/xlb/velocity_set/d3q19.py
index 48c1fb29..c2a9ab4c 100644
--- a/xlb/velocity_set/d3q19.py
+++ b/xlb/velocity_set/d3q19.py
@@ -19,7 +19,6 @@ def __init__(self, precision_policy, compute_backend):
         c = np.array([ci for ci in itertools.product([0, -1, 1], repeat=3) if np.sum(np.abs(ci)) <= 2]).T
         w = np.zeros(19)
         for i in range(19):
-            print(f"{i} -> c[:, i] = {c[:, i]}")
             if np.sum(np.abs(c[:, i])) == 0:
                 w[i] = 1.0 / 3.0
             elif np.sum(np.abs(c[:, i])) == 1:
diff --git a/xlb/velocity_set/velocity_set.py b/xlb/velocity_set/velocity_set.py
index 5c998596..da3fc6f2 100644
--- a/xlb/velocity_set/velocity_set.py
+++ b/xlb/velocity_set/velocity_set.py
@@ -74,6 +74,7 @@ def _init_numpy_properties(self, c, w):
         self.main_indices = self._construct_main_indices()
         self.right_indices = self._construct_right_indices()
         self.left_indices = self._construct_left_indices()
+        self.center_index = self._get_center_index()
 
     def _init_warp_properties(self):
         """
@@ -88,6 +89,9 @@ def _init_warp_properties(self):
         self.qi = wp.constant(wp.mat((self.q, self.d * (self.d + 1) // 2), dtype=dtype)(self._qi))
 
     def _init_neon_properties(self):
+        """
+        Convert NumPy properties to Neon-specific properties which are identical to Warp.
+        """
         self._init_warp_properties()
 
     def _init_jax_properties(self):
@@ -225,6 +229,23 @@ def _construct_left_indices(self):
         """
         return np.nonzero(self._c.T[:, 0] == -1)[0]
 
+    def _get_center_index(self):
+        """
+        This function returns the index of the center point in the lattice associated with (0,0,0)
+
+        Returns
+        -------
+        int
+            The index of the zero lattice velocity.
+        """
+        arr = self._c.T
+        if self.d == 2:
+            target = np.array([0, 0])
+        else:
+            target = np.array([0, 0, 0])
+        match = np.all(arr == target, axis=1)
+        return int(np.nonzero(match)[0][0])
+
     def __str__(self):
         """
         This function returns the name of the lattice in the format of DxQy.

From 4614620ff3ccd6ba9df5ea4d6a185f93b3eacb78 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 22 May 2025 21:27:22 -0400
Subject: [PATCH 042/208] Added KBC to the Neon backend

---
 .../3_levels_mlups_3d_multires_solver.py      | 22 +++++++++----------
 xlb/operator/collision/kbc.py                 |  7 ++++++
 xlb/operator/macroscopic/second_moment.py     |  9 ++++++++
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/3_levels_mlups_3d_multires_solver.py
index a9611f44..ac7d4064 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/3_levels_mlups_3d_multires_solver.py
@@ -61,7 +61,7 @@ def setup_simulation(args):
         default_precision_policy=precision_policy,
     )
 
-    return compute_backend, precision_policy
+    return velocity_set
 
 
 # def construct_indices_per_level(grid_shape_finest, indices_finest, active_voxels_mask_per_level, level_origins):
@@ -175,9 +175,8 @@ def problem2(grid_shape, velocity_set):
     return grid, lid, walls
 
 
-def run(compute_backend, precision_policy, grid_shape, num_steps):
+def run(velocity_set, grid_shape, num_steps):
     # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
 
     # Convert indices to list of indices per level
     # TODO: overlaps emerge if bc indices are orignally specified at the finest grid and they exist at the coarser levels
@@ -198,14 +197,13 @@ def run(compute_backend, precision_policy, grid_shape, num_steps):
     ]
 
     # Create stepper
-    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
+    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC")
 
-    # Re = 5000.0
-
-    # clength = grid_shape[0] - 1
-    # visc = prescribed_vel * clength / Re
-    # omega = 1.0 / (3.0 * visc + 0.5)
-    omega = 1.0
+    Re = 5000.0
+    clength = grid_shape[0] - 1
+    visc = prescribed_vel * clength / Re
+    omega = 1.0 / (3.0 * visc + 0.5)
+    # omega = 1.0
 
     sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
 
@@ -250,9 +248,9 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
 
 def main():
     args = parse_arguments()
-    compute_backend, precision_policy = setup_simulation(args)
+    velocity_set = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    stats = run(compute_backend, precision_policy, grid_shape, args.num_steps)
+    stats = run(velocity_set, grid_shape, args.num_steps)
     mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats["time"], stats["num_levels"])
 
     print(f"Simulation completed in {stats['time']:.2f} seconds")
diff --git a/xlb/operator/collision/kbc.py b/xlb/operator/collision/kbc.py
index 7724e07e..c6528a3d 100644
--- a/xlb/operator/collision/kbc.py
+++ b/xlb/operator/collision/kbc.py
@@ -330,6 +330,13 @@ def kernel(
 
         return functional, kernel
 
+    def _construct_neon(self):
+        # Redefine the momentum flux operator for the neon backend
+        # This is because the neon backend relies on the warp functionals for its operations.
+        self.momentum_flux = MomentumFlux(compute_backend=ComputeBackend.WARP)
+        functional, _ = self._construct_warp()
+        return functional, None
+
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, f, feq, fout, rho, u, omega):
         # Launch the warp kernel
diff --git a/xlb/operator/macroscopic/second_moment.py b/xlb/operator/macroscopic/second_moment.py
index 6c7e70ea..3102d584 100644
--- a/xlb/operator/macroscopic/second_moment.py
+++ b/xlb/operator/macroscopic/second_moment.py
@@ -104,3 +104,12 @@ def warp_implementation(self, f, pi):
         # Launch the warp kernel
         wp.launch(self.warp_kernel, inputs=[f, pi], dim=pi.shape[1:])
         return pi
+
+    def _construct_neon(self):
+        functional, _ = self._construct_warp()
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f, rho):
+        # rise exception as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
\ No newline at end of file

From 57369f58d17a23684b31b892b2a24dddb0183772 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 23 May 2025 10:12:16 -0400
Subject: [PATCH 043/208] renamed some files and classes

---
 ...ultires_solver.py => mlups_3d_multires.py} |  16 +-
 ...l.py => mlups_3d_multires_single_level.py} |   2 +-
 .../performance/mlups_3d_multires_solver.py   | 221 ------------------
 xlb/helper/__init__.py                        |   4 +-
 xlb/helper/{nse_solver.py => nse_fields.py}   |   1 -
 ...ltires_solver.py => simulation_manager.py} |   9 +-
 xlb/operator/macroscopic/second_moment.py     |   2 +-
 xlb/operator/stepper/nse_stepper.py           |   2 +-
 8 files changed, 16 insertions(+), 241 deletions(-)
 rename examples/performance/{3_levels_mlups_3d_multires_solver.py => mlups_3d_multires.py} (96%)
 rename examples/performance/{mlups_3d_multires_solver_single_level.py => mlups_3d_multires_single_level.py} (98%)
 delete mode 100644 examples/performance/mlups_3d_multires_solver.py
 rename xlb/helper/{nse_solver.py => nse_fields.py} (99%)
 rename xlb/helper/{nse_multires_solver.py => simulation_manager.py} (96%)

diff --git a/examples/performance/3_levels_mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires.py
similarity index 96%
rename from examples/performance/3_levels_mlups_3d_multires_solver.py
rename to examples/performance/mlups_3d_multires.py
index ac7d4064..479e00fc 100644
--- a/examples/performance/3_levels_mlups_3d_multires_solver.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -11,6 +11,7 @@
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
 
+
 def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
     # Positional arguments
@@ -118,11 +119,11 @@ def get_peeled_np(level, width):
 
     def get_levels(num_levels):
         levels = []
-        for i in range(num_levels-1):
+        for i in range(num_levels - 1):
             l = get_peeled_np(i, 8)
             levels.append(l)
         lastLevel = num_levels - 1
-        divider = 2 ** lastLevel
+        divider = 2**lastLevel
         m = neon.Index_3d(dim.x // divider + 1, dim.y // divider + 1, dim.z // divider + 1)
         lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
         lastLevel = np.ascontiguousarray(lastLevel, dtype=np.int32)
@@ -132,9 +133,12 @@ def get_levels(num_levels):
     num_levels = 4
     levels = get_levels(num_levels)
 
-    grid = multires_grid_factory(grid_shape, velocity_set=velocity_set,
-                                 sparsity_pattern_list=levels,
-                                 sparsity_pattern_origins=[ neon.Index_3d(0, 0, 0)]*len(levels),)
+    grid = multires_grid_factory(
+        grid_shape,
+        velocity_set=velocity_set,
+        sparsity_pattern_list=levels,
+        sparsity_pattern_origins=[neon.Index_3d(0, 0, 0)] * len(levels),
+    )
 
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -205,7 +209,7 @@ def run(velocity_set, grid_shape, num_steps):
     omega = 1.0 / (3.0 * visc + 0.5)
     # omega = 1.0
 
-    sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
+    sim = xlb.helper.MultiResSimulationManager(grid, velocity_set, stepper, omega)
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
diff --git a/examples/performance/mlups_3d_multires_solver_single_level.py b/examples/performance/mlups_3d_multires_single_level.py
similarity index 98%
rename from examples/performance/mlups_3d_multires_solver_single_level.py
rename to examples/performance/mlups_3d_multires_single_level.py
index ebb3c7df..0c5614e4 100644
--- a/examples/performance/mlups_3d_multires_solver_single_level.py
+++ b/examples/performance/mlups_3d_multires_single_level.py
@@ -126,7 +126,7 @@ def peel(dim, idx, peel_level, outwards):
     # # Initialize fields and run simulation
     # omega = 1.0
 
-    sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
+    sim = xlb.helper.MultiResSimulationManager(grid, velocity_set, stepper, omega)
 
     sim.export_macroscopic("Initial_")
 
diff --git a/examples/performance/mlups_3d_multires_solver.py b/examples/performance/mlups_3d_multires_solver.py
deleted file mode 100644
index 3b555ea7..00000000
--- a/examples/performance/mlups_3d_multires_solver.py
+++ /dev/null
@@ -1,221 +0,0 @@
-import xlb
-import argparse
-import time
-import warp as wp
-import numpy as np
-
-# add a directory to the PYTHON PATH
-import sys
-
-# sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
-import neon
-
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import multires_grid_factory
-from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
-from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
-from xlb.distribute import distribute
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
-    # Positional arguments
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-
-    # Optional arguments
-    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-
-    return parser.parse_args()
-
-
-def setup_simulation(args):
-    backend = None
-    if args.backend == "jax":
-        backend = ComputeBackend.JAX
-    elif args.backend == "warp":
-        backend = ComputeBackend.WARP
-    elif args.backend == "neon":
-        backend = ComputeBackend.NEON
-    if backend is None:
-        raise ValueError("Invalid backend")
-
-    precision_policy_map = {
-        "fp32/fp32": PrecisionPolicy.FP32FP32,
-        "fp64/fp64": PrecisionPolicy.FP64FP64,
-        "fp64/fp32": PrecisionPolicy.FP64FP32,
-        "fp32/fp16": PrecisionPolicy.FP32FP16,
-    }
-    precision_policy = precision_policy_map.get(args.precision)
-    if precision_policy is None:
-        raise ValueError("Invalid precision")
-
-    velocity_set = None
-    if args.velocity_set == "D3Q19":
-        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == "D3Q27":
-        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
-    if velocity_set is None:
-        raise ValueError("Invalid velocity set")
-
-    xlb.init(
-        velocity_set=velocity_set,
-        default_backend=backend,
-        default_precision_policy=precision_policy,
-    )
-
-    return backend, precision_policy
-
-
-def run(backend, precision_policy, grid_shape, num_steps):
-    # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-
-    def peel(dim, idx, peel_level, outwards):
-        if outwards:
-            xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
-            yIn = idx.y <= peel_level or idx.y >= dim.y - 1 - peel_level
-            zIn = idx.z <= peel_level or idx.z >= dim.z - 1 - peel_level
-            return xIn or yIn or zIn
-        else:
-            xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
-            yIn = idx.y >= peel_level and idx.y <= dim.y - 1 - peel_level
-            zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
-            return xIn and yIn and zIn
-
-    dim = neon.Index_3d(grid_shape[0], grid_shape[1], grid_shape[2])
-    level_zero_mask = np.zeros((dim.x, dim.y, dim.z), dtype=int)
-    level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
-    # loop over all the elements in level_zero_mask and set to one any that have x=0 or y=0 or z=0
-    for i in range(dim.x):
-        for j in range(dim.y):
-            for k in range(dim.z):
-                idx = neon.Index_3d(i, j, k)
-                val = 0
-                if peel(dim, idx, dim.x / 9, True):
-                    val = 1
-                level_zero_mask[i, j, k] = val
-
-    m = neon.Index_3d(dim.x // 2, dim.y // 2, dim.z // 2)
-    level_one_mask = np.ones((m.x, m.y, m.z), dtype=int)
-    for i in range(m.x):
-        for j in range(m.x):
-            for k in range(m.x):
-                idx = neon.Index_3d(i, j, k)
-                val = 1
-                level_one_mask[i, j, k] = val
-
-    level_one_mask = np.ascontiguousarray(level_one_mask, dtype=np.int32)
-
-    grid = multires_grid_factory(
-        grid_shape,
-        velocity_set=velocity_set,
-        sparsity_pattern_list=[
-            level_zero_mask,
-            level_one_mask,
-        ],
-        sparsity_pattern_origins=[
-            neon.Index_3d(0, 0, 0),
-            neon.Index_3d(0, 0, 0),
-        ],
-    )
-
-    box = grid.bounding_box_indices()
-    box_no_edge = grid.bounding_box_indices(remove_edges=True)
-    lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
-    walls = np.unique(np.array(walls), axis=-1).tolist()
-
-    prescribed_vel = 0.05
-
-    boundary_conditions = [
-        EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
-        EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls),
-    ]
-
-    # Create stepper
-    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
-
-    Re = 1000.0
-
-    clength = grid_shape[0] - 1
-    visc = prescribed_vel * clength / Re
-    omega = 1.0 / (3.0 * visc + 0.5)
-    omega = 1.0
-
-    # # Initialize fields and run simulation
-    # omega = 1.0
-
-    sim = xlb.helper.Nse_multires_simulation(grid, velocity_set, stepper, omega)
-
-    sim.export_macroscopic("Initial_")
-
-    print("start timing")
-    start_time = time.time()
-
-    for i in range(num_steps):
-        print(f"step {i}")
-        sim.step()
-        if i % 10 == 0:
-            sim.export_macroscopic("u_lid_driven_cavity_")
-    wp.synchronize()
-    t = time.time() - start_time
-
-    sim.export_macroscopic("u_lid_driven_cavity_")
-    return t
-
-
-def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    total_lattice_updates = cube_edge**3 * num_steps
-    mlups = (total_lattice_updates / elapsed_time) / 1e6
-    return mlups
-
-
-def post_process(macro, rho, u, f_0, i):
-    # Write the results. We'll use JAX backend for the post-processing
-    # import jax.numpy as jnp
-    # if not isinstance(f_0, jnp.ndarray):
-    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
-    #     f_0 = wp.to_jax(f_0)[..., 0]
-    # else:
-    #     f_0 = f_0
-    rho, u = macro(f_0, rho, u)
-    wp.synchronize()
-    u.update_host(0)
-    rho.update_host(0)
-    wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
-
-    pass
-
-    # # remove boundary cells
-    # rho = rho[:, 1:-1, 1:-1, 1:-1]
-    # u = u[:, 1:-1, 1:-1, 1:-1]
-    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
-    #
-    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
-    #
-    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
-    # ny=fields["u_magnitude"].shape[1]
-    # from xlb.utils import  save_image
-    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
-
-
-def main():
-    args = parse_arguments()
-    backend, precision_policy = setup_simulation(args)
-    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
-    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
-
-    print(f"Simulation completed in {elapsed_time:.2f} seconds")
-    print(f"MLUPs: {mlups:.2f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/xlb/helper/__init__.py b/xlb/helper/__init__.py
index 1a3de972..a6f0dd7c 100644
--- a/xlb/helper/__init__.py
+++ b/xlb/helper/__init__.py
@@ -1,5 +1,5 @@
-from xlb.helper.nse_solver import create_nse_fields
+from xlb.helper.nse_fields import create_nse_fields
 from xlb.helper.initializers import initialize_eq
 from xlb.helper.initializers import initialize_multires_eq
 from xlb.helper.check_boundary_overlaps import check_bc_overlaps
-from xlb.helper.nse_multires_solver import Nse_multires_simulation
+from xlb.helper.simulation_manager import MultiResSimulationManager
diff --git a/xlb/helper/nse_solver.py b/xlb/helper/nse_fields.py
similarity index 99%
rename from xlb/helper/nse_solver.py
rename to xlb/helper/nse_fields.py
index 075f6ab3..c56eb07c 100644
--- a/xlb/helper/nse_solver.py
+++ b/xlb/helper/nse_fields.py
@@ -2,7 +2,6 @@
 from xlb.grid import grid_factory
 from xlb.precision_policy import Precision
 from typing import Tuple
-import neon
 
 
 def create_nse_fields(
diff --git a/xlb/helper/nse_multires_solver.py b/xlb/helper/simulation_manager.py
similarity index 96%
rename from xlb/helper/nse_multires_solver.py
rename to xlb/helper/simulation_manager.py
index 800ef7b4..5df80687 100644
--- a/xlb/helper/nse_multires_solver.py
+++ b/xlb/helper/simulation_manager.py
@@ -1,14 +1,7 @@
-import numpy as np
-
-from xlb import DefaultConfig
-from xlb.grid.multires_grid import NeonMultiresGrid
-from xlb.precision_policy import Precision
-from typing import Tuple, List
 import neon
-import warp as wp
 
 
-class Nse_multires_simulation:
+class MultiResSimulationManager:
     def __init__(self, grid, velocity_set, stepper, omega):
         self.stepper = stepper
         self.grid = stepper.get_grid()
diff --git a/xlb/operator/macroscopic/second_moment.py b/xlb/operator/macroscopic/second_moment.py
index 3102d584..ee74bdd9 100644
--- a/xlb/operator/macroscopic/second_moment.py
+++ b/xlb/operator/macroscopic/second_moment.py
@@ -112,4 +112,4 @@ def _construct_neon(self):
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho):
         # rise exception as this feature is not implemented yet
-        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
\ No newline at end of file
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index d3a389e8..33b8690b 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -22,7 +22,7 @@
 from xlb.operator.collision import ForcedCollision
 from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
 from xlb.helper import check_bc_overlaps
-from xlb.helper.nse_solver import create_nse_fields
+from xlb.helper.nse_fields import create_nse_fields
 
 
 class IncompressibleNavierStokesStepper(Stepper):

From 3af781a87bdb24ebaeea431e83f68899ad7c9410 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 23 May 2025 15:36:02 -0400
Subject: [PATCH 044/208] added Neon backend to a couple BCs

---
 xlb/operator/boundary_condition/bc_do_nothing.py     |  9 +++++++++
 xlb/operator/boundary_condition/bc_equilibrium.py    |  3 +--
 xlb/operator/boundary_condition/bc_zouhe.py          | 12 ++++++++++++
 xlb/operator/equilibrium/__init__.py                 |  3 ++-
 .../equilibrium/mulltires_quadratic_equilibrium.py   |  2 +-
 xlb/operator/equilibrium/quadratic_equilibrium.py    |  2 +-
 6 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index aeefd788..91713b8a 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -74,3 +74,12 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             dim=f_pre.shape[1:],
         )
         return f_post
+
+    def _construct_neon(self):
+        functional, _ = self._construct_warp()
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # rise exception as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index fbfb2449..b7e12e95 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -12,8 +12,7 @@
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
-from xlb.operator.equilibrium.equilibrium import Equilibrium
-from xlb.operator.equilibrium import QuadraticEquilibrium
+from xlb.operator.equilibrium import Equilibrium, QuadraticEquilibrium
 from xlb.operator.operator import Operator
 from xlb.operator.boundary_condition.boundary_condition import (
     ImplementationStep,
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 5cad5048..5953baf1 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -360,3 +360,15 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             dim=f_pre.shape[1:],
         )
         return f_post
+
+    def _construct_neon(self):
+        # Redefine the quadratic eq operator for the neon backend
+        # This is because the neon backend relies on the warp functionals for its operations.
+        self.equilibrium_operator = QuadraticEquilibrium(compute_backend=ComputeBackend.WARP)
+        functional, _ = self._construct_warp()
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # rise exception as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
\ No newline at end of file
diff --git a/xlb/operator/equilibrium/__init__.py b/xlb/operator/equilibrium/__init__.py
index 474bdf5c..372ae1f7 100644
--- a/xlb/operator/equilibrium/__init__.py
+++ b/xlb/operator/equilibrium/__init__.py
@@ -1,2 +1,3 @@
-from xlb.operator.equilibrium.quadratic_equilibrium import Equilibrium, QuadraticEquilibrium
+from xlb.operator.equilibrium.equilibrium import Equilibrium
+from xlb.operator.equilibrium.quadratic_equilibrium import QuadraticEquilibrium
 from xlb.operator.equilibrium.mulltires_quadratic_equilibrium import MultiresQuadraticEquilibrium
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index 0e9288d1..f88f3c3d 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -8,7 +8,7 @@
 from typing import Any
 
 from xlb.compute_backend import ComputeBackend
-from xlb.operator.equilibrium.equilibrium import Equilibrium
+from xlb.operator.equilibrium import Equilibrium
 from xlb.operator import Operator
 
 
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index a286d6a1..c9fc05d7 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -8,7 +8,7 @@
 from typing import Any
 
 from xlb.compute_backend import ComputeBackend
-from xlb.operator.equilibrium.equilibrium import Equilibrium
+from xlb.operator.equilibrium import Equilibrium
 from xlb.operator import Operator
 
 

From 2d3a502eb0499e79b5224c8a3c4ffd7cc3336656 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 23 May 2025 17:17:15 -0400
Subject: [PATCH 045/208] converted missing mask from boolean to uint8
 everywhere.

---
 xlb/operator/boundary_condition/bc_zouhe.py           |  2 +-
 xlb/operator/boundary_condition/boundary_condition.py |  4 ++--
 .../boundary_condition/helper_functions_bc.py         |  2 +-
 xlb/operator/boundary_masker/mesh_boundary_masker.py  |  4 ++--
 .../boundary_masker/multires_boundary_masker.py       | 11 +++--------
 xlb/operator/force/momentum_transfer.py               |  2 +-
 6 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 5953baf1..a1812255 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -371,4 +371,4 @@ def _construct_neon(self):
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         # rise exception as this feature is not implemented yet
-        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
\ No newline at end of file
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 0b8a93c6..01e57d4d 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -94,7 +94,7 @@ def kernel(
             f_pre: wp.array4d(dtype=Any),
             f_post: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
+            missing_mask: wp.array4d(dtype=wp.uint8),
         ):
             # Get the global index
             i, j, k = wp.tid()
@@ -132,7 +132,7 @@ def aux_data_init_kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
+            missing_mask: wp.array4d(dtype=wp.uint8),
         ):
             # Get the global index
             i, j, k = wp.tid()
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 6f8e768b..269a9f3a 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -38,7 +38,7 @@ def get_thread_data(
             f_pre: wp.array4d(dtype=Any),
             f_post: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
+            missing_mask: wp.array4d(dtype=wp.uint8),
             index: wp.vec3i,
         ):
             # Get the boundary id and missing mask
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index 40dd0311..40647462 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -155,7 +155,7 @@ def kernel(
             mesh_id: wp.uint64,
             id_number: wp.int32,
             bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
+            missing_mask: wp.array4d(dtype=wp.uint8),
         ):
             # get index
             i, j, k = wp.tid()
@@ -180,7 +180,7 @@ def kernel(
                         # We know we have a solid neighbor
                         # Set the boundary id and missing_mask
                         bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = wp.uint8(True)
 
         return None, kernel
 
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index a58c5312..d3212975 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -18,6 +18,9 @@ def __init__(
         precision_policy=None,
         compute_backend=None,
     ):
+        if compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name} not supported in {compute_backend} backend!")
+
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
 
@@ -33,14 +36,6 @@ def __init__(
             compute_backend=ComputeBackend.WARP,
         )
 
-    @Operator.register_backend(ComputeBackend.JAX)
-    def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        raise NotImplementedError(f"Operation {self.__class__.__name} not implemented in JAX!")
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
-        raise NotImplementedError(f"Operation {self.__class__.__name} not implemented in WARP!")
-
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
         # Ensure that this operator is called on multires grids
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 1c6255d3..dda43650 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -110,7 +110,7 @@ def kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
+            missing_mask: wp.array4d(dtype=wp.uint8),
             force: wp.array(dtype=Any),
         ):
             # Get the global index

From b0a517bfbfde44108bc90d4542bb5222ff49ad48 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 23 May 2025 22:06:32 -0400
Subject: [PATCH 046/208] Heavily relying on existing warp functionals for the
 Neon backend of operators

---
 .../boundary_condition/bc_equilibrium.py      | 31 ++-----
 .../multires_boundary_masker.py               |  6 +-
 .../mulltires_quadratic_equilibrium.py        | 53 +++---------
 .../equilibrium/quadratic_equilibrium.py      | 41 ++--------
 xlb/operator/macroscopic/macroscopic.py       | 80 ++-----------------
 .../macroscopic/multires_macroscopic.py       | 69 +++-------------
 6 files changed, 45 insertions(+), 235 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index b7e12e95..7b81f1b4 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -90,32 +90,13 @@ def functional(
         return functional, kernel
 
     def _construct_neon(self):
-        # Set local constants TODO: This is a hack and should be fixed with warp update
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        _rho = self.compute_dtype(self.rho)
-        _u = _u_vec(self.u[0], self.u[1], self.u[2]) if self.velocity_set.d == 3 else _u_vec(self.u[0], self.u[1])
-
-        # Construct the functional for this BC
-        @wp.func
-        def functional(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # we can use directly the warp_functional method from the equilibrium operator
-            # the Neon implementation is the same as the Warp implementation as all the computation
-            # is done at the register level
-            _f = self.equilibrium_operator.neon_functional(_rho, _u)
-            return _f
+        # Redefine the equilibrium operators for the neon backend
+        # This is because the neon backend relies on the warp functionals for its operations.
+        self.equilibrium_operator = QuadraticEquilibrium(compute_backend=ComputeBackend.WARP)
 
-        # Use the parent class's kernel and pass the functional
-        kernel = None
-
-        return functional, kernel
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+        return functional, None
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_launch(self, f_pre, f_post, bc_mask, missing_mask):
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index d3212975..1b197143 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -19,7 +19,7 @@ def __init__(
         compute_backend=None,
     ):
         if compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
-            raise NotImplementedError(f"Operator {self.__class__.__name} not supported in {compute_backend} backend!")
+            raise NotImplementedError(f"Operator {self.__class__.__name} not supported in {compute_backend} backend.")
 
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
@@ -56,8 +56,8 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
             bclist_level = []
             for bc in bclist:
                 if bc.indices is not None and bc.indices[level]:
-                    bc_copy = copy.deepcopy(bc)
-                    bc_copy.indices = bc_copy.indices[level]
+                    bc_copy = copy.copy(bc)  # shallow copy of the whole object
+                    bc_copy.indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
                     bclist_level.append(bc_copy)
 
             # call indices masker for this level
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index f88f3c3d..023b7da7 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -1,62 +1,29 @@
-from functools import partial
-import jax.numpy as jnp
-from jax import jit
 import warp as wp
-import os
-
 import neon
 from typing import Any
-
 from xlb.compute_backend import ComputeBackend
-from xlb.operator.equilibrium import Equilibrium
+from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator import Operator
 
 
-class MultiresQuadraticEquilibrium(Equilibrium):
+class MultiresQuadraticEquilibrium(QuadraticEquilibrium):
     """
     Quadratic equilibrium of Boltzmann equation using hermite polynomials.
     Standard equilibrium model for LBM.
     """
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
+
     def _construct_neon(self):
-        import neon
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
 
         # Set local constants TODO: This is a hack and should be fixed with warp update
-        _c = self.velocity_set.c
-        _w = self.velocity_set.w
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
 
-        # Construct the equilibrium functional
-        @wp.func
-        def functional(
-            rho: Any,
-            u: Any,
-        ):
-            # Allocate the equilibrium
-            feq = _f_vec()
-
-            # Compute the equilibrium
-            for l in range(self.velocity_set.q):
-                # Compute cu
-                cu = self.compute_dtype(0.0)
-                for d in range(self.velocity_set.d):
-                    if _c[d, l] == 1:
-                        cu += u[d]
-                    elif _c[d, l] == -1:
-                        cu -= u[d]
-                cu *= self.compute_dtype(3.0)
-
-                # Compute usqr
-                usqr = self.compute_dtype(1.5) * wp.dot(u, u)
-
-                # Compute feq
-                feq[l] = rho * _w[l] * (self.compute_dtype(1.0) + cu * (self.compute_dtype(1.0) + self.compute_dtype(0.5) * cu) - usqr)
-
-            return feq
-
-        import typing
-
         @neon.Container.factory(name="QuadraticEquilibrium")
         def container(
             level,
@@ -72,7 +39,7 @@ def quadratic_equilibrium_ll(loader: neon.Loader):
                 f_pn = loader.get_mres_write_handle(f)
 
                 @wp.func
-                def quadratic_equilibrium_cl(index: typing.Any):
+                def quadratic_equilibrium_cl(index: Any):
                     _u = _u_vec()
                     for d in range(self.velocity_set.d):
                         _u[d] = wp.neon_read(u_pn, index, d)
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index c9fc05d7..646d1425 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -18,6 +18,9 @@ class QuadraticEquilibrium(Equilibrium):
     Standard equilibrium model for LBM.
     """
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
     def jax_implementation(self, rho, u):
@@ -101,44 +104,14 @@ def warp_implementation(self, rho, u, f):
         return f
 
     def _construct_neon(self):
-        import neon
+        import neon, typing
+
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
 
         # Set local constants TODO: This is a hack and should be fixed with warp update
-        _c = self.velocity_set.c
-        _w = self.velocity_set.w
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
 
-        # Construct the equilibrium functional
-        @wp.func
-        def functional(
-            rho: Any,
-            u: Any,
-        ):
-            # Allocate the equilibrium
-            feq = _f_vec()
-
-            # Compute the equilibrium
-            for l in range(self.velocity_set.q):
-                # Compute cu
-                cu = self.compute_dtype(0.0)
-                for d in range(self.velocity_set.d):
-                    if _c[d, l] == 1:
-                        cu += u[d]
-                    elif _c[d, l] == -1:
-                        cu -= u[d]
-                cu *= self.compute_dtype(3.0)
-
-                # Compute usqr
-                usqr = self.compute_dtype(1.5) * wp.dot(u, u)
-
-                # Compute feq
-                feq[l] = rho * _w[l] * (self.compute_dtype(1.0) + cu * (self.compute_dtype(1.0) + self.compute_dtype(0.5) * cu) - usqr)
-
-            return feq
-
-        import neon, typing
-
         @neon.Container.factory(name="QuadraticEquilibrium")
         def container(
             rho: Any,
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index eed2ac10..4fd464ad 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -26,14 +26,12 @@ def jax_implementation(self, f):
         return rho, u
 
     def _construct_warp(self):
-        zero_moment_func = self.zero_moment.warp_functional
-        first_moment_func = self.first_moment.warp_functional
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         @wp.func
         def functional(f: _f_vec):
-            rho = zero_moment_func(f)
-            u = first_moment_func(f, rho)
+            rho = self.zero_moment.warp_functional(f)
+            u = self.first_moment.warp_functional(f, rho)
             return rho, u
 
         @wp.kernel
@@ -66,70 +64,20 @@ def warp_implementation(self, f, rho, u):
         return rho, u
 
     def _construct_neon(self):
-        zero_moment_func = self.zero_moment.neon_functional
-        first_moment_func = self.first_moment.neon_functional
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
-        @wp.func
-        def functional(f: _f_vec):
-            rho = zero_moment_func(f)
-            u = first_moment_func(f, rho)
-            return rho, u
-
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
         import neon, typing
 
-        @neon.Container.factory("macroscopic")
-        def container(
-            f_field: Any,
-            rho_field: Any,
-            u_fild: Any,
-        ):
-            _d = self.velocity_set.d
-
-            def macroscopic_ll(loader: neon.Loader):
-                loader.set_grid(f_field.get_grid())
-
-                rho = loader.get_read_handle(rho_field)
-                u = loader.get_read_handle(u_fild)
-                f = loader.get_read_handle(f_field)
-
-                @wp.func
-                def macroscopic_cl(gIdx: typing.Any):
-                    _f = _f_vec()
-                    for l in range(self.velocity_set.q):
-                        _f[l] = wp.neon_read(f, gIdx, l)
-                    _rho, _u = functional(_f)
-                    wp.neon_write(rho, gIdx, 0, _rho)
-                    for d in range(_d):
-                        wp.neon_write(u, gIdx, d, _u[d])
-
-                loader.declare_kernel(macroscopic_cl)
-
-            return macroscopic_ll
-
-        return functional, container
-
-    def _construct_neon_visual(self):
-        zero_moment_func = self.zero_moment.neon_functional
-        first_moment_func = self.first_moment.neon_functional
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
-        @wp.func
-        def functional(f: _f_vec):
-            rho = zero_moment_func(f)
-            u = first_moment_func(f, rho)
-            return rho, u
+        # Redefine the zero and first moment operators for the neon backend
+        # This is because the neon backend relies on the warp functionals for its operations.
+        self.zero_moment = ZeroMoment(compute_backend=ComputeBackend.WARP)
+        self.first_moment = FirstMoment(compute_backend=ComputeBackend.WARP)
+        functional, _ = self._construct_warp()
 
+        # Set local vectors
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
-        import neon, typing
-
         @neon.Container.factory("macroscopic")
         def container(
             f_field: Any,
-            bc_mask: Any,
             rho_field: Any,
             u_fild: Any,
         ):
@@ -141,25 +89,13 @@ def macroscopic_ll(loader: neon.Loader):
                 rho = loader.get_read_handle(rho_field)
                 u = loader.get_read_handle(u_fild)
                 f = loader.get_read_handle(f_field)
-                bc_mask_pn = loader.get_read_handle(bc_mask)
 
                 @wp.func
                 def macroscopic_cl(gIdx: typing.Any):
                     _f = _f_vec()
-                    _boundary_id = wp.neon_read(bc_mask_pn, gIdx, 0)
-
                     for l in range(self.velocity_set.q):
                         _f[l] = wp.neon_read(f, gIdx, l)
                     _rho, _u = functional(_f)
-                    if _boundary_id != wp.uint8(0):
-                        _rho = self.compute_dtype(1.0)
-                        for d in range(_d):
-                            _u[d] = self.compute_dtype(0.0)
-                    if _boundary_id == wp.uint8(255):
-                        _rho = self.compute_dtype(0.0)
-                        for d in range(_d):
-                            _u[d] = self.compute_dtype(0.0)
-
                     wp.neon_write(rho, gIdx, 0, _rho)
                     for d in range(_d):
                         wp.neon_write(u, gIdx, d, _u[d])
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index 0a5bc92e..12e5b76b 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -6,73 +6,26 @@
 
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
-from xlb.operator.macroscopic.zero_moment import ZeroMoment
-from xlb.operator.macroscopic.first_moment import FirstMoment
+from xlb.operator.macroscopic import Macroscopic, ZeroMoment, FirstMoment
 
 
-class MultiresMacroscopic(Operator):
-    """A class to compute both zero and first moments of distribution functions (rho, u)."""
+class MultiresMacroscopic(Macroscopic):
+    """A class to compute both zero and first moments of distribution functions (rho, u) on a multi-resolution grid."""
 
     def __init__(self, *args, **kwargs):
-        self.zero_moment = ZeroMoment(*args, **kwargs)
-        self.first_moment = FirstMoment(*args, **kwargs)
         super().__init__(*args, **kwargs)
-
-    def _construct_warp(self):
-        zero_moment_func = self.zero_moment.warp_functional
-        first_moment_func = self.first_moment.warp_functional
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
-        @wp.func
-        def functional(f: _f_vec):
-            rho = zero_moment_func(f)
-            u = first_moment_func(f, rho)
-            return rho, u
-
-        @wp.kernel
-        def kernel(
-            f: wp.array4d(dtype=Any),
-            rho: wp.array4d(dtype=Any),
-            u: wp.array4d(dtype=Any),
-        ):
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1], index[2]]
-            _rho, _u = functional(_f)
-
-            rho[0, index[0], index[1], index[2]] = self.store_dtype(_rho)
-            for d in range(self.velocity_set.d):
-                u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
-
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f, rho, u):
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f, rho, u],
-            dim=rho.shape[1:],
-        )
-        return rho, u
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
 
     def _construct_neon(self):
-        zero_moment_func = self.zero_moment.neon_functional
-        first_moment_func = self.first_moment.neon_functional
-        print(f"VELOCITY SET: {self.velocity_set.q}")
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
-        @wp.func
-        def functional(f: _f_vec):
-            rho = zero_moment_func(f)
-            u = first_moment_func(f, rho)
-            return rho, u
+        import neon, typing
 
+        # Redefine the zero and first moment operators for the neon backend
+        # This is because the neon backend relies on the warp functionals for its operations.
+        self.zero_moment = ZeroMoment(compute_backend=ComputeBackend.WARP)
+        self.first_moment = FirstMoment(compute_backend=ComputeBackend.WARP)
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
-        import neon, typing
+        functional, _ = self._construct_warp()
 
         @neon.Container.factory("macroscopic")
         def container(

From 6664392e059265965414bfa37233f1034e11bcea Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sat, 24 May 2025 22:23:52 -0400
Subject: [PATCH 047/208] added back post-collision apply_bc and the results
 are correct.

---
 xlb/operator/stepper/nse_multires_stepper.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 93ad346a..da1d926e 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -366,7 +366,9 @@ def device(index: typing.Any):
                         _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
 
                         # Apply post-collision boundary conditions
-                        # _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+                        _f_post_collision = apply_bc(
+                            index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                        )
 
                         for l in range(self.velocity_set.q):
                             push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))

From 7ed39c590314dfb1c45454208d480e0869a57a30 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 26 May 2025 14:36:17 -0400
Subject: [PATCH 048/208] WIP: extrapolation outflow in Neon

---
 xlb/operator/boundary_condition/bc_extrapolation_outflow.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 884e691e..1fa6f412 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -54,13 +54,14 @@ def __init__(
         )
 
         # find and store the normal vector using indices
-        self._get_normal_vec(indices)
+        if compute_backend == ComputeBackend.JAX:
+            self._get_normal_vectors(indices)
 
         # Unpack the two warp functionals needed for this BC!
         if self.compute_backend == ComputeBackend.WARP:
             self.warp_functional, self.update_bc_auxilary_data = self.warp_functional
 
-    def _get_normal_vec(self, indices):
+    def _get_normal_vectors(self, indices):
         # Get the frequency count and most common element directly
         freq_counts = [Counter(coord).most_common(1)[0] for coord in indices]
 

From 1f11b7ce4faac09477986ddda73dc22ee04bda83 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 26 May 2025 14:42:51 -0400
Subject: [PATCH 049/208] added neon_apply_aux_recovery_bc, used center_index
 and more refactoring

---
 xlb/helper/initializers.py                    |  2 +-
 .../mulltires_quadratic_equilibrium.py        |  6 +-
 xlb/operator/force/momentum_transfer.py       |  7 +--
 xlb/operator/stepper/nse_multires_stepper.py  | 18 +++---
 xlb/operator/stepper/nse_stepper.py           | 62 ++++++++++++-------
 5 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index d94cfa0f..82e62443 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -28,5 +28,5 @@ def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho
     equilibrium = MultiresQuadraticEquilibrium()
     for level in range(grid.count_levels):
         print("MultiresQuadraticEquilibrium")
-        equilibrium(level=level, rho=rho, u=u, f=f, stream=0)
+        equilibrium(rho=rho, u=u, f=f, level=level, stream=0)
     return f
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index 023b7da7..c4eb43b5 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -26,10 +26,10 @@ def _construct_neon(self):
 
         @neon.Container.factory(name="QuadraticEquilibrium")
         def container(
-            level,
             rho: Any,
             u: Any,
             f: Any,
+            level,
         ):
             def quadratic_equilibrium_ll(loader: neon.Loader):
                 loader.set_mres_grid(rho.get_grid(), level)
@@ -60,8 +60,8 @@ def quadratic_equilibrium_cl(index: Any):
         return functional, container
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, level, rho, u, f, stream):
-        c = self.neon_container(level, rho, u, f)
+    def neon_implementation(self, rho, u, f, level, stream):
+        c = self.neon_container(rho, u, f, level)
         c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
 
         return f
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index dda43650..b1f18478 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -99,10 +99,7 @@ def _construct_warp(self):
         _no_slip_id = self.no_slip_bc_instance.id
 
         # Find velocity index for 0, 0, 0
-        for l in range(self.velocity_set.q):
-            if _c[0, l] == 0 and _c[1, l] == 0 and _c[2, l] == 0:
-                zero_index = l
-        _zero_index = wp.int32(zero_index)
+        lattice_central_index = self.velocity_set.center_index
 
         # Construct the warp kernel
         @wp.kernel
@@ -130,7 +127,7 @@ def kernel(
             # Determin if boundary is an edge by checking if center is missing
             is_edge = wp.bool(False)
             if _boundary_id == wp.uint8(_no_slip_id):
-                if _missing_mask[_zero_index] == wp.uint8(0):
+                if _missing_mask[lattice_central_index] == wp.uint8(0):
                     is_edge = wp.bool(True)
 
             # If the boundary is an edge then add the momentum transfer
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index da1d926e..822cef5f 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -109,10 +109,9 @@ def ll_coalescence_count(loader: neon.Loader):
 
                 _c = self.velocity_set.c
                 _w = self.velocity_set.w
-                import typing
 
                 @wp.func
-                def cl_collide_coarse(index: typing.Any):
+                def cl_collide_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
@@ -141,10 +140,9 @@ def loading(loader: neon.Loader):
 
                 _c = self.velocity_set.c
                 _w = self.velocity_set.w
-                import typing
 
                 @wp.func
-                def compute(index: typing.Any):
+                def compute(index: Any):
                     # _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     # if _boundary_id == wp.uint8(255):
                     #     return
@@ -346,7 +344,7 @@ def ll_collide_coarse(loader: neon.Loader):
                 _w = self.velocity_set.w
 
                 @wp.func
-                def device(index: typing.Any):
+                def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     """
                     The c++ version starts with the following, which I am not sure is right:
@@ -375,13 +373,13 @@ def device(index: typing.Any):
                             if level < num_levels - 1:
                                 val = _f_post_collision[l]
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
-                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
+                                # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
 
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
-                            wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
+                            # Verified that this is not needed: wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
 
                 loader.declare_kernel(device)
 
@@ -421,7 +419,7 @@ def ll_stream_coarse(loader: neon.Loader):
                 coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
 
                 @wp.func
-                def cl_stream_coarse(index: typing.Any):
+                def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
@@ -529,7 +527,7 @@ def ll_stream_coarse(loader: neon.Loader):
                 _c = self.velocity_set.c
 
                 @wp.func
-                def cl_stream_coarse(index: typing.Any):
+                def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
@@ -577,7 +575,7 @@ def ll_stream_coarse(loader: neon.Loader):
                 _w = self.velocity_set.w
 
                 @wp.func
-                def cl_stream_coarse(index: typing.Any):
+                def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 33b8690b..e2a58dbc 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -215,20 +215,16 @@ def _construct_warp(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _opp_indices = self.velocity_set.opp_indices
+        lattice_central_index = self.velocity_set.center_index
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
-        id_to_bc = boundary_condition_registry.id_to_bc
 
         # Gather IDs of ExtrapolationOutflowBC boundary conditions
         extrapolation_outflow_bc_ids = []
         for bc_name, bc_id in bc_to_id.items():
             if bc_name.startswith("ExtrapolationOutflowBC"):
                 extrapolation_outflow_bc_ids.append(bc_id)
-        # Group active boundary conditions
-        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
-
-        _opp_indices = self.velocity_set.opp_indices
 
         @wp.func
         def apply_bc(
@@ -301,12 +297,13 @@ def apply_aux_recovery_bc(
             for i in range(wp.static(len(self.boundary_conditions))):
                 if wp.static(self.boundary_conditions[i].needs_aux_recovery):
                     if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                        # Perform the swapping of data
-                        # (i) Recover the values stored in the central index of f_1
-                        f_0[0, index[0], index[1], index[2]] = self.store_dtype(_f1_thread[0])
-                        # (ii) Recover the values stored in the missing directions of f_1
-                        for l in range(1, self.velocity_set.q):
-                            if _missing_mask[l] == wp.uint8(1):
+                        for l in range(self.velocity_set.q):
+                            # Perform the swapping of data
+                            if l == lattice_central_index:
+                                # (i) Recover the values stored in the central index of f_1
+                                f_0[l, index[0], index[1], index[2]] = self.store_dtype(_f1_thread[l])
+                            elif _missing_mask[l] == wp.uint8(1):
+                                # (ii) Recover the values stored in the missing directions of f_1
                                 f_0[_opp_indices[l], index[0], index[1], index[2]] = self.store_dtype(_f1_thread[_opp_indices[l]])
 
         @wp.kernel
@@ -364,19 +361,17 @@ def _construct_neon(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _opp_indices = self.velocity_set.opp_indices
+        lattice_central_index = self.velocity_set.center_index
         # _cast_to_store_dtype = self.store_dtype()
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
-        id_to_bc = boundary_condition_registry.id_to_bc
 
         # Gather IDs of ExtrapolationOutflowBC boundary conditions
         extrapolation_outflow_bc_ids = []
         for bc_name, bc_id in bc_to_id.items():
             if bc_name.startswith("ExtrapolationOutflowBC"):
                 extrapolation_outflow_bc_ids.append(bc_id)
-        # Group active boundary conditions
-        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
 
         @wp.func
         def apply_bc(
@@ -428,7 +423,34 @@ def neon_get_thread_data(
 
             return _f0_thread, _f1_thread, _missing_mask
 
-        import typing
+        @wp.func
+        def neon_apply_aux_recovery_bc(
+            index: Any,
+            _boundary_id: Any,
+            _missing_mask: Any,
+            f_0_pn: Any,
+            _f1_thread: Any,
+        ):
+            # Note:
+            # In XLB, the BC auxiliary data (e.g. prescribed values of pressure or normal velocity) are stored in (i) the central index of f_1 and/or
+            # (ii) the missing directions of f_1. Some BCs may not need all of this available storage space. This function checks whether
+            # the BC needs recovery of auxiliary data and then recovers the information for the next iteration (due to buffer swapping) by
+            # writing the thread values of f_1 (i.e. _f1_thread) into f_0.
+
+            # Unroll the loop over boundary conditions
+            for i in range(wp.static(len(self.boundary_conditions))):
+                if wp.static(self.boundary_conditions[i].needs_aux_recovery):
+                    if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                        for l in range(self.velocity_set.q):
+                            # Perform the swapping of data
+                            if l == lattice_central_index:
+                                # (i) Recover the values stored in the central index of f_1
+                                # TODO: Add store dtype
+                                wp.neon_write(f_0_pn, index, l, _f1_thread[l])
+                            elif _missing_mask[l] == wp.uint8(1):
+                                # (ii) Recover the values stored in the missing directions of f_1
+                                # TODO: Add store dtype
+                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
 
         @neon.Container.factory(name="nse_stepper")
         def container(
@@ -451,7 +473,7 @@ def nse_stepper_ll(loader: neon.Loader):
                 f_1_pn = loader.get_write_handle(f_1_fd)
 
                 @wp.func
-                def nse_stepper_cl(index: typing.Any):
+                def nse_stepper_cl(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
@@ -473,13 +495,11 @@ def nse_stepper_cl(index: typing.Any):
                         index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
                     )
 
+                    # Apply auxiliary recovery for boundary conditions (swapping)
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, _f1_thread)
+
                     # Store the result in f_1
                     for l in range(self.velocity_set.q):
-                        # TODO: Improve this later
-                        if wp.static("GradsApproximationBC" in active_bcs):
-                            if _boundary_id == wp.static(boundary_condition_registry.bc_to_id["GradsApproximationBC"]):
-                                if _missing_mask[l] == wp.uint8(1):
-                                    wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
                         wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
 
                 loader.declare_kernel(nse_stepper_cl)

From 0fdf77a4b3aabcf741bcd37a1b6dce1c686f24b0 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 26 May 2025 15:59:40 -0400
Subject: [PATCH 050/208] Always using Warp backend when creating bc helper
 class to support Neon as well.

---
 xlb/operator/boundary_condition/bc_regularized.py | 6 ++----
 xlb/operator/boundary_condition/bc_zouhe.py       | 7 +++----
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 1950fc1b..15a37d8b 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -124,12 +124,10 @@ def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         return f_post
 
     def _construct_warp(self):
-        # load helper functions
-        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
+        # load helper functions. Always use warp backend for helper functions as it may also be called by the Neon backend.
+        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=ComputeBackend.WARP)
         # Set local constants
         _d = self.velocity_set.d
-        _q = self.velocity_set.q
-        _opp_indices = self.velocity_set.opp_indices
 
         @wp.func
         def functional_velocity(
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index a1812255..eb179496 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -269,12 +269,11 @@ def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         return f_post
 
     def _construct_warp(self):
-        # load helper functions
-        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
+        # load helper functions. Always use warp backend for helper functions as it may also be called by the Neon backend.
+        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=ComputeBackend.WARP)
+
         # Set local constants
         _d = self.velocity_set.d
-        _q = self.velocity_set.q
-        _opp_indices = self.velocity_set.opp_indices
 
         @wp.func
         def functional_velocity(

From bf59ebc884afb463ff434ccf0643a615116542a5 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 26 May 2025 21:53:57 -0400
Subject: [PATCH 051/208] fixed a bug

---
 examples/performance/mlups_3d_multires.py              | 2 +-
 examples/performance/mlups_3d_multires_single_level.py | 2 +-
 xlb/grid/grid.py                                       | 2 ++
 xlb/helper/__init__.py                                 | 2 +-
 xlb/helper/simulation_manager.py                       | 3 ++-
 xlb/operator/stepper/nse_multires_stepper.py           | 3 +--
 6 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 479e00fc..96a87e1d 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -209,7 +209,7 @@ def run(velocity_set, grid_shape, num_steps):
     omega = 1.0 / (3.0 * visc + 0.5)
     # omega = 1.0
 
-    sim = xlb.helper.MultiResSimulationManager(grid, velocity_set, stepper, omega)
+    sim = xlb.helper.MultiresSimulationManager(grid, velocity_set, stepper, omega)
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
diff --git a/examples/performance/mlups_3d_multires_single_level.py b/examples/performance/mlups_3d_multires_single_level.py
index 0c5614e4..4bbeb0a5 100644
--- a/examples/performance/mlups_3d_multires_single_level.py
+++ b/examples/performance/mlups_3d_multires_single_level.py
@@ -126,7 +126,7 @@ def peel(dim, idx, peel_level, outwards):
     # # Initialize fields and run simulation
     # omega = 1.0
 
-    sim = xlb.helper.MultiResSimulationManager(grid, velocity_set, stepper, omega)
+    sim = xlb.helper.MultiresSimulationManager(grid, velocity_set, stepper, omega)
 
     sim.export_macroscopic("Initial_")
 
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index 7ae34684..fd831675 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -13,6 +13,7 @@ def grid_factory(
     velocity_set=None,
 ):
     compute_backend = compute_backend or DefaultConfig.default_backend
+    velocity_set = velocity_set or DefaultConfig.velocity_set
     if compute_backend == ComputeBackend.WARP:
         from xlb.grid.warp_grid import WarpGrid
 
@@ -37,6 +38,7 @@ def multires_grid_factory(
     sparsity_pattern_origins: List[neon.Index_3d] = [],
 ):
     compute_backend = compute_backend or DefaultConfig.default_backend
+    velocity_set = velocity_set or DefaultConfig.velocity_set
     if compute_backend == ComputeBackend.NEON:
         from xlb.grid.multires_grid import NeonMultiresGrid
 
diff --git a/xlb/helper/__init__.py b/xlb/helper/__init__.py
index a6f0dd7c..d6aa42c3 100644
--- a/xlb/helper/__init__.py
+++ b/xlb/helper/__init__.py
@@ -2,4 +2,4 @@
 from xlb.helper.initializers import initialize_eq
 from xlb.helper.initializers import initialize_multires_eq
 from xlb.helper.check_boundary_overlaps import check_bc_overlaps
-from xlb.helper.simulation_manager import MultiResSimulationManager
+from xlb.helper.simulation_manager import MultiresSimulationManager
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 5df80687..bfbac21d 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -1,7 +1,7 @@
 import neon
 
 
-class MultiResSimulationManager:
+class MultiresSimulationManager:
     def __init__(self, grid, velocity_set, stepper, omega):
         self.stepper = stepper
         self.grid = stepper.get_grid()
@@ -109,6 +109,7 @@ def recurtion(level, app):
             recurtion(level - 1, app)
             recurtion(level - 1, app)
 
+            # Important: swapping of f_0 and f_1 is done here
             print(f"RECURTION Level {level}, stream_coarse_step_ABC")
             self.stepper.add_to_app(
                 app=app,
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 822cef5f..b67282a9 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -493,7 +493,6 @@ def cl_stream_coarse(index: Any):
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    # wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -663,7 +662,7 @@ def ll_stream_coarse(loader: neon.Loader):
                 _c = self.velocity_set.c
 
                 @wp.func
-                def cl_stream_coarse(index: typing.Any):
+                def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return

From 9e9580f88b655c7f0ad5d5342920bce814e0b44c Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 27 May 2025 00:37:08 -0400
Subject: [PATCH 052/208] Completed Encoding/Decoding of BC aux data for
 ZouHe/Regularized for Multires

---
 .../grid_refinement/flow_past_sphere_3d.py    | 179 ++++++++++++++++++
 examples/performance/mlups_3d_multires.py     |   6 +-
 .../boundary_condition/bc_regularized.py      |  17 +-
 xlb/operator/boundary_condition/bc_zouhe.py   |  23 ++-
 .../boundary_condition/boundary_condition.py  | 146 ++++++++++++++
 .../boundary_condition/helper_functions_bc.py |  22 +++
 xlb/operator/stepper/nse_multires_stepper.py  |  41 +++-
 7 files changed, 423 insertions(+), 11 deletions(-)
 create mode 100644 examples/cfd/grid_refinement/flow_past_sphere_3d.py

diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
new file mode 100644
index 00000000..9a1e891b
--- /dev/null
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -0,0 +1,179 @@
+import xlb
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import FullwayBounceBackBC, HalfwayBounceBackBC, RegularizedBC, ExtrapolationOutflowBC, DoNothingBC, ZouHeBC
+import neon
+import warp as wp
+import numpy as np
+import jax.numpy as jnp
+import time
+
+# -------------------------- Simulation Setup --------------------------
+
+omega = 1.6
+grid_shape = (256 // 2, 256 // 2, 256 // 2)
+compute_backend = ComputeBackend.NEON
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
+u_max = 0.04
+num_steps = 2000
+post_process_interval = 100
+
+# Initialize XLB
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+# Create the multires grid
+# TODO: with rectangular cuboid for the inner box, there are some issues with the
+#       multires_grid_factory. The inner box should be a cube for now!
+nx, ny, nz = grid_shape
+sphere_origin = (nx // 2, ny // 2, nz // 2)
+sphere_radius = ny // 12
+inner_box_shape = (6 * sphere_radius, 6 * sphere_radius, 6 * sphere_radius)
+num_levels = 2
+level_1 = np.ones((nx // 2, ny // 2, nz // 2), dtype=int)
+level_0 = np.ones(inner_box_shape, dtype=int)
+level_0 = np.ascontiguousarray(level_0, dtype=np.int32)
+levels = [level_0, level_1]
+level_origins = [((nx - inner_box_shape[0]) // 2, (ny - inner_box_shape[1]) // 2, (nz - inner_box_shape[2]) // 2), (0, 0, 0)]
+
+grid = multires_grid_factory(
+    grid_shape,
+    velocity_set=velocity_set,
+    sparsity_pattern_list=[level_0, level_1],
+    sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
+)
+
+# Define Boundary Indices
+coarsest_level = grid.count_levels - 1
+box = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level))
+box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level), remove_edges=True)
+inlet = box_no_edge["left"]
+outlet = box_no_edge["right"]
+walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
+walls = np.unique(np.array(walls), axis=-1).tolist()
+
+# sphere at the finest level
+x = np.arange(nx)
+y = np.arange(ny)
+z = np.arange(nz)
+X, Y, Z = np.meshgrid(x, y, z, indexing="ij")
+indices = np.where((X - sphere_origin[0]) ** 2 + (Y - sphere_origin[1]) ** 2 + (Z - sphere_origin[2]) ** 2 < sphere_radius**2)
+sphere = [tuple(indices[i]) for i in range(velocity_set.d)]
+
+# Convert bc indices to a list of list (first entry corresponds to the finest level)
+inlet = [[], inlet]
+outlet = [[], outlet]
+walls = [[], walls]
+sphere = [sphere, []]
+
+
+# Define Boundary Conditions
+def bc_profile():
+    H_y = float(ny - 1)  # Height in y direction
+    H_z = float(nz - 1)  # Height in z direction
+
+    if compute_backend == ComputeBackend.JAX:
+
+        def bc_profile_jax():
+            y = jnp.arange(ny)
+            z = jnp.arange(nz)
+            Y, Z = jnp.meshgrid(y, z, indexing="ij")
+
+            # Calculate normalized distance from center
+            y_center = Y - (H_y / 2.0)
+            z_center = Z - (H_z / 2.0)
+            r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+
+            # Parabolic profile for x velocity, zero for y and z
+            u_x = u_max * jnp.maximum(0.0, 1.0 - r_squared)
+            u_y = jnp.zeros_like(u_x)
+            u_z = jnp.zeros_like(u_x)
+
+            return jnp.stack([u_x, u_y, u_z])
+
+        return bc_profile_jax
+
+    elif compute_backend == ComputeBackend.WARP:
+
+        @wp.func
+        def bc_profile_warp(index: wp.vec3i):
+            # Poiseuille flow profile: parabolic velocity distribution
+            y = wp.float32(index[1])
+            z = wp.float32(index[2])
+
+            # Calculate normalized distance from center
+            y_center = y - (H_y / 2.0)
+            z_center = z - (H_z / 2.0)
+            r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+
+            # Parabolic profile: u = u_max * (1 - r²)
+            return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+
+    elif compute_backend == ComputeBackend.NEON:
+        raise NotImplementedError("BC profile not implemented yet!")
+        # @wp.func
+        # def bc_profile_warp(index: Any):
+        #     # Get the refinement level for the current index
+        #     refinement = 2 ** grid.get_level(index)
+        #     cIdx = wp.neon_global_idx(bc_mask_hdl, index)
+        #     # get local indices by dividing the global indices (associated with the finest level) by 2^level
+        #     lx = wp.neon_get_x(cIdx) // refinement
+        #     ly = wp.neon_get_y(cIdx) // refinement
+        #     lz = wp.neon_get_z(cIdx) // refinement
+
+        #     # Poiseuille flow profile: parabolic velocity distribution
+        #     y = wp.float32(index[1])
+        #     z = wp.float32(index[2])
+
+        #     # Calculate normalized distance from center
+        #     y_center = y - (H_y / 2.0)
+        #     z_center = z - (H_z / 2.0)
+        #     r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+
+        #     # Parabolic profile: u = u_max * (1 - r²)
+        #     return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+
+    return bc_profile_warp
+
+
+# Initialize Boundary Conditions
+# bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
+# Alternatively, use a prescribed velocity profile
+bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
+bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
+# bc_outlet = ExtrapolationOutflowBC(indices=outlet)
+bc_outlet = DoNothingBC(indices=outlet)
+bc_sphere = HalfwayBounceBackBC(indices=sphere)
+boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
+
+# Setup Stepper
+stepper = MultiresIncompressibleNavierStokesStepper(
+    grid=grid,
+    boundary_conditions=boundary_conditions,
+    collision_type="BGK",
+)
+
+# Define a multi-resolution simulation manager
+sim = xlb.helper.MultiresSimulationManager(grid, velocity_set, stepper, omega)
+
+# -------------------------- Simulation Loop --------------------------
+
+wp.synchronize()
+start_time = time.time()
+for step in range(num_steps):
+    sim.step()
+
+    if step % post_process_interval == 0 or step == num_steps - 1:
+        # TODO: Issues in the vtk output for rectangular cuboids (as if a cuboid grid with the largest side is assumed)
+        sim.export_macroscopic("multires_flow_over_sphere_3d_")
+        wp.synchronize()
+        end_time = time.time()
+        elapsed = end_time - start_time
+        print(f"Completed step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")
+        start_time = time.time()
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 96a87e1d..14ef7819 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -189,15 +189,15 @@ def run(velocity_set, grid_shape, num_steps):
     # walls = construct_indices_per_level(grid_shape, walls, levels_mask, level_origins)
 
     # Example 1: fine to coarse
-    grid, lid, walls = problem1(grid_shape, velocity_set)
+    # grid, lid, walls = problem1(grid_shape, velocity_set)
 
     # Example 2: Coarse to fine:
-    # grid, lid, walls = problem2(grid_shape, velocity_set)
+    grid, lid, walls = problem2(grid_shape, velocity_set)
 
     prescribed_vel = 0.1
     boundary_conditions = [
         EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
-        EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls),
+        FullwayBounceBackBC(indices=walls),
     ]
 
     # Create stepper
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 15a37d8b..ad3560c1 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -128,6 +128,7 @@ def _construct_warp(self):
         bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=ComputeBackend.WARP)
         # Set local constants
         _d = self.velocity_set.d
+        lattice_central_index = self.velocity_set.center_index
 
         @wp.func
         def functional_velocity(
@@ -148,7 +149,7 @@ def functional_velocity(
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = f_1[0, index[0], index[1], index[2]]
+            prescribed_value = decode_lattice_center_value(index, f_1)
             _u = -prescribed_value * normals
 
             # calculate rho
@@ -184,7 +185,7 @@ def functional_pressure(
 
             # Find the value of rho from the missing directions
             # Since we need only one scalar value, we only need to find one value (stored at the center of f_1)
-            _rho = f_1[0, index[0], index[1], index[2]]
+            _rho = decode_lattice_center_value(index, f_1)
 
             # calculate velocity
             fsum = bc_helper.get_bc_fsum(_f, missing_mask)
@@ -199,6 +200,18 @@ def functional_pressure(
             _f = bc_helper.regularize_fpop(_f, feq)
             return _f
 
+        @wp.func
+        def decode_lattice_center_value(index: Any, f_1: Any):
+            """
+            Decode the encoded values needed for the boundary condition treatment from the center location in f_1.
+            """
+            if wp.static(self.compute_backend == ComputeBackend.WARP):
+                value = f_1[lattice_central_index, index[0], index[1], index[2]]
+            else:
+                # Note: in Neon case, f_1 is a pointer to the field not the actual data.
+                value = wp.neon_read(f_1, index, lattice_central_index)
+            return self.compute_dtype(value)
+
         if self.bc_type == "velocity":
             functional = functional_velocity
         elif self.bc_type == "pressure":
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index eb179496..3705f77c 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -93,7 +93,7 @@ def __init__(
             # a single non-zero number associated with pressure BC OR
             # a vector of zeros associated with no-slip BC.
             # Accounting for all scenarios here.
-            if self.compute_backend is ComputeBackend.WARP:
+            if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
                 idx = np.nonzero(prescribed_value)[0]
                 prescribed_value = prescribed_value[idx][0] if idx.size else 0.0
                 prescribed_value = self.precision_policy.store_precision.wp_dtype(prescribed_value)
@@ -116,7 +116,7 @@ def _create_constant_prescribed_profile(self):
         _prescribed_value = self.prescribed_value
 
         @wp.func
-        def prescribed_profile_warp(index: wp.vec3i):
+        def prescribed_profile_warp(index: Any):
             return wp.vec(_prescribed_value, length=1)
 
         def prescribed_profile_jax():
@@ -126,6 +126,8 @@ def prescribed_profile_jax():
             return prescribed_profile_jax
         elif self.compute_backend == ComputeBackend.WARP:
             return prescribed_profile_warp
+        elif self.compute_backend == ComputeBackend.NEON:
+            return prescribed_profile_warp
 
     @partial(jit, static_argnums=(0,), inline=True)
     def _get_known_middle_mask(self, missing_mask):
@@ -274,6 +276,7 @@ def _construct_warp(self):
 
         # Set local constants
         _d = self.velocity_set.d
+        lattice_central_index = self.velocity_set.center_index
 
         @wp.func
         def functional_velocity(
@@ -298,7 +301,7 @@ def functional_velocity(
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = f_1[0, index[0], index[1], index[2]]
+            prescribed_value = decode_lattice_center_value(index, f_1)
             _u = -prescribed_value * normals
 
             for d in range(_d):
@@ -329,7 +332,7 @@ def functional_pressure(
 
             # Find the value of rho from the missing directions
             # Since we need only one scalar value, we only need to find one value (stored at the center of f_1)
-            _rho = f_1[0, index[0], index[1], index[2]]
+            _rho = decode_lattice_center_value(index, f_1)
 
             # calculate velocity
             fsum = bc_helper.get_bc_fsum(_f, _missing_mask)
@@ -341,6 +344,18 @@ def functional_pressure(
             _f = bc_helper.bounceback_nonequilibrium(_f, feq, _missing_mask)
             return _f
 
+        @wp.func
+        def decode_lattice_center_value(index: Any, f_1: Any):
+            """
+            Decode the encoded values needed for the boundary condition treatment from the center location in f_1.
+            """
+            if wp.static(self.compute_backend == ComputeBackend.WARP):
+                value = f_1[lattice_central_index, index[0], index[1], index[2]]
+            else:
+                # Note: in Neon case, f_1 is a pointer to the field not the actual data.
+                value = wp.neon_read(f_1, index, lattice_central_index)
+            return self.compute_dtype(value)
+
         if self.bc_type == "velocity":
             functional = functional_velocity
         elif self.bc_type == "pressure":
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 01e57d4d..991b6e3a 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -17,6 +17,7 @@
 from xlb import DefaultConfig
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.boundary_condition import HelperFunctionsBC
+import neon
 
 
 # Enum for implementation step
@@ -160,6 +161,71 @@ def aux_data_init_kernel(
 
         return aux_data_init_kernel
 
+    def _construct_aux_data_init_container(self, functional):
+        """
+        Constructs the Neon container for encoding auxiliary data recovery.
+        """
+
+        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
+
+        _id = wp.uint8(self.id)
+        _opp_indices = self.velocity_set.opp_indices
+        _num_of_aux_data = self.num_of_aux_data
+
+        # Find velocity index for 0, 0, 0
+        lattice_central_index = self.velocity_set.center_index
+
+        # Construct the Neon container
+        @neon.Container.factory(name="EncodingAuxData_" + str(self.id))
+        def aux_data_init_container(
+            f_0: Any,
+            f_1: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+        ):
+            def aux_data_init_ll(loader: neon.Loader):
+                loader.set_grid(f_0.get_grid())
+
+                f_0_pn = loader.get_read_handle(f_0)
+                f_1_pn = loader.get_write_handle(f_1)
+                bc_mask_pn = loader.get_read_handle(bc_mask)
+                missing_mask_pn = loader.get_read_handle(missing_mask)
+
+                @wp.func
+                def aux_data_init_cl(index: Any):
+                    # read tid data
+                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_thread_data(f_0_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
+
+                    # Apply the functional
+                    if _boundary_id == _id:
+                        # prescribed_values is a q-sized vector of type wp.vec
+                        prescribed_values = functional(index)
+
+                    # Write the result for all q directions, but only store up to num_of_aux_data
+                    counter = wp.int32(0)
+                    for l in range(self.velocity_set.q):
+                        # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                        if l == lattice_central_index:
+                            # The first BC auxiliary data is stored in f_1 at the lattice central index (velocity 0, 0, 0).
+                            wp.neon_write(f_1_pn, index, l, self.store_dtype(prescribed_values[l]))
+                            counter += 1
+                        elif _missing_mask[l] == wp.uint8(1):
+                            # The other remaining BC auxiliary data are stored in missing directions of f_1.
+                            # Only store up to num_of_aux_data
+                            wp.neon_write(f_1_pn, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
+                            counter += 1
+                        if counter > _num_of_aux_data:
+                            # Only store up to num_of_aux_data
+                            return
+
+                # Declare the kernel in the Neon loader
+                loader.declare_kernel(aux_data_init_cl)
+
+            return aux_data_init_ll
+
+        return aux_data_init_container
+
+    # Initialize auxiliary data for the boundary condition.
     def aux_data_init(self, f_0, f_1, bc_mask, missing_mask):
         if self.compute_backend == ComputeBackend.WARP:
             # Launch the warp kernel
@@ -171,5 +237,85 @@ def aux_data_init(self, f_0, f_1, bc_mask, missing_mask):
         elif self.compute_backend == ComputeBackend.JAX:
             # We don't use boundary aux encoding/decoding in JAX
             self.prescribed_values = self.profile()
+        elif self.compute_backend == ComputeBackend.NEON:
+            c = self._construct_aux_data_init_container(self.profile)(f_0, f_1, bc_mask, missing_mask)
+            c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+        self.is_initialized_with_aux_data = True
+        return f_0, f_1
+
+    def _construct_multires_aux_data_init_container(self, functional):
+        """
+        Constructs the Neon container for encoding auxiliary data recovery.
+        """
+
+        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
+
+        _id = wp.uint8(self.id)
+        _opp_indices = self.velocity_set.opp_indices
+        _num_of_aux_data = self.num_of_aux_data
+
+        # Find velocity index for 0, 0, 0
+        lattice_central_index = self.velocity_set.center_index
+
+        # Construct the Neon container
+        @neon.Container.factory(name="MultiresEncodingAuxData_" + str(self.id))
+        def aux_data_init_container(
+            f_0: Any,
+            f_1: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            level: Any,
+        ):
+            def aux_data_init_ll(loader: neon.Loader):
+                loader.set_mres_grid(f_0.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0)
+                f_1_pn = loader.get_mres_write_handle(f_1)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask)
+
+                @wp.func
+                def aux_data_init_cl(index: Any):
+                    # read tid data
+                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_thread_data(f_0_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
+
+                    # Apply the functional
+                    if _boundary_id == _id:
+                        # prescribed_values is a q-sized vector of type wp.vec
+                        prescribed_values = functional(index)
+
+                    # Write the result for all q directions, but only store up to num_of_aux_data
+                    counter = wp.int32(0)
+                    for l in range(self.velocity_set.q):
+                        # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                        if l == lattice_central_index:
+                            # The first BC auxiliary data is stored in f_1 at the lattice central index (velocity 0, 0, 0).
+                            # TODO: add self.store_dtype
+                            wp.neon_write(f_1_pn, index, l, prescribed_values[l])
+                            counter += 1
+                        elif _missing_mask[l] == wp.uint8(1):
+                            # The other remaining BC auxiliary data are stored in missing directions of f_1.
+                            # Only store up to num_of_aux_data
+                            # TODO: add self.store_dtype
+                            wp.neon_write(f_1_pn, index, _opp_indices[l], prescribed_values[l])
+                            counter += 1
+                        if counter > _num_of_aux_data:
+                            # Only store up to num_of_aux_data
+                            return
+
+                # Declare the kernel in the Neon loader
+                loader.declare_kernel(aux_data_init_cl)
+
+            return aux_data_init_ll
+
+        return aux_data_init_container
+
+    # Initialize auxiliary data for the boundary condition.
+    def multires_aux_data_init(self, f_0, f_1, bc_mask, missing_mask, level, stream):
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
+        if self.compute_backend == ComputeBackend.NEON:
+            c = self._construct_multires_aux_data_init_container(self.profile)(f_0, f_1, bc_mask, missing_mask, level)
+            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
         self.is_initialized_with_aux_data = True
         return f_0, f_1
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 269a9f3a..c4f1ddfe 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -58,6 +58,27 @@ def get_thread_data(
                     _missing_mask[l] = wp.uint8(0)
             return _f_pre, _f_post, _boundary_id, _missing_mask
 
+        @wp.func
+        def neon_get_thread_data(
+            f_pre_pn: Any,
+            f_post_pn: Any,
+            bc_mask_pn: Any,
+            missing_mask_pn: Any,
+            index: Any,
+        ):
+            # Get the boundary id and missing mask
+            _f_pre = _f_vec()
+            _f_post = _f_vec()
+            _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+            _missing_mask = _missing_mask_vec()
+            for l in range(_q):
+                # q-sized vector of populations
+                _f_pre[l] = compute_dtype(wp.neon_read(f_pre_pn, index, l))
+                _f_post[l] = compute_dtype(wp.neon_read(f_post_pn, index, l))
+                _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
+
+            return _f_pre, _f_post, _boundary_id, _missing_mask
+
         @wp.func
         def get_bc_fsum(
             fpop: Any,
@@ -126,3 +147,4 @@ def regularize_fpop(
         self.get_normal_vectors = get_normal_vectors
         self.bounceback_nonequilibrium = bounceback_nonequilibrium
         self.regularize_fpop = regularize_fpop
+        self.neon_get_thread_data = neon_get_thread_data
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index b67282a9..270c3b8d 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -243,7 +243,9 @@ def _initialize_auxiliary_data(boundary_conditions, f_0, f_1, bc_mask, missing_m
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
-                f_0, f_1 = bc.aux_data_init(f_0, f_1, bc_mask, missing_mask)
+                for level in range(bc_mask.get_grid().get_num_levels()):
+                    # Initialize auxiliary data for each level
+                    f_0, f_1 = bc.multires_aux_data_init(f_0, f_1, bc_mask, missing_mask, level=level, stream=0)
         return f_0, f_1
 
     def _construct_neon(self):
@@ -251,6 +253,7 @@ def _construct_neon(self):
         lattice_central_index = self.velocity_set.center_index
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
+        _opp_indices = self.velocity_set.opp_indices
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
@@ -311,7 +314,34 @@ def neon_get_thread_data(
 
             return _f0_thread, _f1_thread, _missing_mask
 
-        import typing
+        @wp.func
+        def neon_apply_aux_recovery_bc(
+            index: Any,
+            _boundary_id: Any,
+            _missing_mask: Any,
+            f_0_pn: Any,
+            _f1_thread: Any,
+        ):
+            # Note:
+            # In XLB, the BC auxiliary data (e.g. prescribed values of pressure or normal velocity) are stored in (i) central index of f_1 and/or
+            # (ii) missing directions of f_1. Some BCs may or may not need all of this available storage space. This function checks whether
+            # the BC needs recovery of auxiliary data and then recovers the information for the next iteration (due to buffer swapping) by
+            # writing the thread values of f_1 (i.e. _f1_thread) into f_0.
+
+            # Unroll the loop over boundary conditions
+            for i in range(wp.static(len(self.boundary_conditions))):
+                if wp.static(self.boundary_conditions[i].needs_aux_recovery):
+                    if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                        for l in range(self.velocity_set.q):
+                            # Perform the swapping of data
+                            if l == lattice_central_index:
+                                # (i) Recover the values stored in the central index of f_1
+                                # TODO: Add store dtype
+                                wp.neon_write(f_0_pn, index, l, _f1_thread[l])
+                            elif _missing_mask[l] == wp.uint8(1):
+                                # (ii) Recover the values stored in the missing directions of f_1
+                                # TODO: Add store dtype
+                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
 
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(
@@ -368,6 +398,7 @@ def device(index: Any):
                             index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
                         )
 
+                        # Accumulate the post-collision populations in f_0
                         for l in range(self.velocity_set.q):
                             push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
                             if level < num_levels - 1:
@@ -375,6 +406,9 @@ def device(index: Any):
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
                                 # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
 
+                            # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                            neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, _f1_thread)
+
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
@@ -491,6 +525,9 @@ def cl_stream_coarse(index: Any):
                     # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
+                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, _f1_thread)
+
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
 

From cc3ffeadfa522c0407bce84f8c2f99a57b1573a6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 27 May 2025 11:46:51 -0400
Subject: [PATCH 053/208] addressed PR review

---
 examples/performance/mlups_3d.py              |   2 +
 examples/performance/mlups_3d_multires.py     |  16 +-
 .../mlups_3d_multires_single_level.py         | 197 ------------------
 examples/performance/mlups_3d_neon.py         | 161 --------------
 .../multires_boundary_masker.py               |   2 +-
 5 files changed, 9 insertions(+), 369 deletions(-)
 delete mode 100644 examples/performance/mlups_3d_multires_single_level.py
 delete mode 100644 examples/performance/mlups_3d_neon.py

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index eca222df..1a2ebdae 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -29,6 +29,8 @@ def setup_simulation(args):
         compute_backend = ComputeBackend.WARP
     elif args.compute_backend == "neon":
         compute_backend = ComputeBackend.NEON
+    else:
+        raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.")
     precision_policy_map = {
         "fp32/fp32": PrecisionPolicy.FP32FP32,
         "fp64/fp64": PrecisionPolicy.FP64FP64,
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 14ef7819..54cf3865 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -29,14 +29,10 @@ def parse_arguments():
 
 def setup_simulation(args):
     compute_backend = None
-    if args.compute_backend == "jax":
-        compute_backend = ComputeBackend.JAX
-    elif args.compute_backend == "warp":
-        compute_backend = ComputeBackend.WARP
-    elif args.compute_backend == "neon":
+    if args.compute_backend == "neon":
         compute_backend = ComputeBackend.NEON
-    if compute_backend is None:
-        raise ValueError("Invalid backend")
+    else:
+        raise ValueError("Invalid compute backend specified. Use 'neon' which supports multi-resolution!")
 
     precision_policy_map = {
         "fp32/fp32": PrecisionPolicy.FP32FP32,
@@ -219,9 +215,9 @@ def run(velocity_set, grid_shape, num_steps):
     start_time = time.time()
     for i in range(num_steps):
         sim.step()
-        if i % 1000 == 0:
-            print(f"step {i}")
-            sim.export_macroscopic("u_lid_driven_cavity_")
+        # if i % 1000 == 0:
+        #     print(f"step {i}")
+        #     sim.export_macroscopic("u_lid_driven_cavity_")
     wp.synchronize()
     t = time.time() - start_time
     print(f"Timing  {t}")
diff --git a/examples/performance/mlups_3d_multires_single_level.py b/examples/performance/mlups_3d_multires_single_level.py
deleted file mode 100644
index 4bbeb0a5..00000000
--- a/examples/performance/mlups_3d_multires_single_level.py
+++ /dev/null
@@ -1,197 +0,0 @@
-import xlb
-import argparse
-import time
-import warp as wp
-import numpy as np
-
-# add a directory to the PYTHON PATH
-import sys
-
-# sys.path.append('/home/max/repos/neon/warping/neon_warp_testing/neon_py_bindings/py/')
-import neon
-
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import multires_grid_factory
-from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
-from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
-from xlb.distribute import distribute
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
-    # Positional arguments
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-
-    # Optional arguments
-    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-
-    return parser.parse_args()
-
-
-def setup_simulation(args):
-    backend = None
-    if args.backend == "jax":
-        backend = ComputeBackend.JAX
-    elif args.backend == "warp":
-        backend = ComputeBackend.WARP
-    elif args.backend == "neon":
-        backend = ComputeBackend.NEON
-    if backend is None:
-        raise ValueError("Invalid backend")
-
-    precision_policy_map = {
-        "fp32/fp32": PrecisionPolicy.FP32FP32,
-        "fp64/fp64": PrecisionPolicy.FP64FP64,
-        "fp64/fp32": PrecisionPolicy.FP64FP32,
-        "fp32/fp16": PrecisionPolicy.FP32FP16,
-    }
-    precision_policy = precision_policy_map.get(args.precision)
-    if precision_policy is None:
-        raise ValueError("Invalid precision")
-
-    velocity_set = None
-    if args.velocity_set == "D3Q19":
-        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-    elif args.velocity_set == "D3Q27":
-        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, backend=backend)
-    if velocity_set is None:
-        raise ValueError("Invalid velocity set")
-
-    xlb.init(
-        velocity_set=velocity_set,
-        default_backend=backend,
-        default_precision_policy=precision_policy,
-    )
-
-    return backend, precision_policy
-
-
-def run(backend, precision_policy, grid_shape, num_steps):
-    # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
-
-    def peel(dim, idx, peel_level, outwards):
-        if outwards:
-            xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
-            yIn = idx.y <= peel_level or idx.y >= dim.y - 1 - peel_level
-            zIn = idx.z <= peel_level or idx.z >= dim.z - 1 - peel_level
-            return xIn or yIn or zIn
-        else:
-            xIn = idx.x >= peel_level and idx.x <= dim.x - 1 - peel_level
-            yIn = idx.y >= peel_level and idx.y <= dim.y - 1 - peel_level
-            zIn = idx.z >= peel_level and idx.z <= dim.z - 1 - peel_level
-            return xIn and yIn and zIn
-
-    dim = neon.Index_3d(grid_shape[0], grid_shape[1], grid_shape[2])
-    level_zero_mask = np.ones((dim.x, dim.y, dim.z), dtype=int)
-    level_zero_mask = np.ascontiguousarray(level_zero_mask, dtype=np.int32)
-
-    grid = multires_grid_factory(
-        grid_shape,
-        velocity_set=velocity_set,
-        sparsity_pattern_list=[
-            level_zero_mask,
-        ],
-        sparsity_pattern_origins=[
-            neon.Index_3d(0, 0, 0),
-        ],
-    )
-
-    box = grid.bounding_box_indices()
-    box_no_edge = grid.bounding_box_indices(remove_edges=True)
-    lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
-    walls = np.unique(np.array(walls), axis=-1).tolist()
-
-    prescribed_vel = 0.05
-
-    boundary_conditions = [
-        EquilibriumBC(rho=1.0, u=(prescribed_vel, 0.0, 0.0), indices=lid),
-        EquilibriumBC(rho=1.0, u=(0.0, 0.0, 0.0), indices=walls),
-    ]
-
-    # Create stepper
-    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
-
-    Re = 100.0
-    clength = grid_shape[0] - 1
-    visc = prescribed_vel * clength / Re
-    omega = 1.0 / (3.0 * visc + 0.5)
-
-    # # Initialize fields and run simulation
-    # omega = 1.0
-
-    sim = xlb.helper.MultiresSimulationManager(grid, velocity_set, stepper, omega)
-
-    sim.export_macroscopic("Initial_")
-
-    print("start timing")
-    start_time = time.time()
-
-    for i in range(num_steps):
-        print(f"step {i}")
-        sim.step()
-        if i % 1 == 0:
-            sim.export_macroscopic("u_lid_driven_cavity_")
-    wp.synchronize()
-    t = time.time() - start_time
-
-    sim.export_macroscopic("u_lid_driven_cavity_")
-    return t
-
-
-def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    total_lattice_updates = cube_edge**3 * num_steps
-    mlups = (total_lattice_updates / elapsed_time) / 1e6
-    return mlups
-
-
-def post_process(macro, rho, u, f_0, i):
-    # Write the results. We'll use JAX backend for the post-processing
-    # import jax.numpy as jnp
-    # if not isinstance(f_0, jnp.ndarray):
-    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
-    #     f_0 = wp.to_jax(f_0)[..., 0]
-    # else:
-    #     f_0 = f_0
-    rho, u = macro(f_0, rho, u)
-    wp.synchronize()
-    u.update_host(0)
-    rho.update_host(0)
-    wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
-
-    pass
-
-    # # remove boundary cells
-    # rho = rho[:, 1:-1, 1:-1, 1:-1]
-    # u = u[:, 1:-1, 1:-1, 1:-1]
-    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
-    #
-    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
-    #
-    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
-    # ny=fields["u_magnitude"].shape[1]
-    # from xlb.utils import  save_image
-    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
-
-
-def main():
-    args = parse_arguments()
-    backend, precision_policy = setup_simulation(args)
-    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    elapsed_time = run(backend, precision_policy, grid_shape, args.num_steps)
-    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
-
-    print(f"Simulation completed in {elapsed_time:.2f} seconds")
-    print(f"MLUPs: {mlups:.2f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/performance/mlups_3d_neon.py b/examples/performance/mlups_3d_neon.py
deleted file mode 100644
index 932c1567..00000000
--- a/examples/performance/mlups_3d_neon.py
+++ /dev/null
@@ -1,161 +0,0 @@
-from warp.examples.fem.example_convection_diffusion import velocity
-
-import xlb
-import argparse
-import time
-import warp as wp
-import numpy as np
-
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import grid_factory
-from xlb.operator.stepper import IncompressibleNavierStokesStepper
-from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
-from xlb.distribute import distribute
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
-    # Positional arguments
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-
-    # Optional arguments
-    parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-
-    return parser.parse_args()
-
-
-def setup_simulation(args):
-    compute_backend = None
-    if args.compute_backend == "jax":
-        compute_backend = ComputeBackend.JAX
-    elif args.compute_backend == "warp":
-        compute_backend = ComputeBackend.WARP
-    elif args.compute_backend == "neon":
-        compute_backend = ComputeBackend.NEON
-    if compute_backend is None:
-        raise ValueError("Invalid backend")
-
-    precision_policy_map = {
-        "fp32/fp32": PrecisionPolicy.FP32FP32,
-        "fp64/fp64": PrecisionPolicy.FP64FP64,
-        "fp64/fp32": PrecisionPolicy.FP64FP32,
-        "fp32/fp16": PrecisionPolicy.FP32FP16,
-    }
-    precision_policy = precision_policy_map.get(args.precision)
-    if precision_policy is None:
-        raise ValueError("Invalid precision")
-
-    velocity_set = None
-    if args.velocity_set == "D3Q19":
-        velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
-    elif args.velocity_set == "D3Q27":
-        velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
-    if velocity_set is None:
-        raise ValueError("Invalid velocity set")
-
-    xlb.init(
-        velocity_set=velocity_set,
-        default_backend=compute_backend,
-        default_precision_policy=precision_policy,
-    )
-
-    return compute_backend, precision_policy
-
-
-def run(macro, compute_backend, precision_policy, grid_shape, num_steps):
-    # Create grid and setup boundary conditions
-    velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
-    grid = grid_factory(grid_shape, velocity_set=velocity_set)
-    box = grid.bounding_box_indices()
-    box_no_edge = grid.bounding_box_indices(remove_edges=True)
-    lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
-    walls = np.unique(np.array(walls), axis=-1).tolist()
-
-    boundary_conditions = [EquilibriumBC(rho=1.0, u=(0.02, 0.0, 0.0), indices=lid), FullwayBounceBackBC(indices=walls)]
-
-    # Create stepper
-    stepper = IncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="BGK")
-
-    # Initialize fields and run simulation
-    omega = 1.0
-    f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
-    rho = stepper.grid.create_field(1, dtype=precision_policy.store_precision)
-    u = stepper.grid.create_field(3, dtype=precision_policy.store_precision)
-
-    start_time = time.time()
-
-    for i in range(num_steps):
-        f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, 0)
-        f_0, f_1 = f_1, f_0
-
-        # if i % 2 == 0 or i == num_steps - 1:
-        wp.synchronize()
-        post_process(macro, rho, u, f_0, i)
-    wp.synchronize()
-
-    return time.time() - start_time
-
-
-def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    total_lattice_updates = cube_edge**3 * num_steps
-    mlups = (total_lattice_updates / elapsed_time) / 1e6
-    return mlups
-
-
-def post_process(macro, rho, u, f_0, i):
-    # Write the results. We'll use JAX backend for the post-processing
-    # import jax.numpy as jnp
-    # if not isinstance(f_0, jnp.ndarray):
-    #     # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
-    #     f_0 = wp.to_jax(f_0)[..., 0]
-    # else:
-    #     f_0 = f_0
-    rho, u = macro(f_0, rho, u)
-    wp.synchronize()
-    u.update_host(0)
-    rho.update_host(0)
-    wp.synchronize()
-    u.export_vti(f"u_lid_driven_cavity_{i}.vti", "u")
-    rho.export_vti(f"rho_lid_driven_cavity_{i}.vti", "rho")
-
-    pass
-
-    # # remove boundary cells
-    # rho = rho[:, 1:-1, 1:-1, 1:-1]
-    # u = u[:, 1:-1, 1:-1, 1:-1]
-    # u_magnitude = (u[0] ** 2 + u[1] ** 2) ** 0.5
-    #
-    # fields = {"rho": rho[0], "u_x": u[0], "u_y": u[1], "u_magnitude": u_magnitude}
-    #
-    # # save_fields_vtk(fields, timestep=i, prefix="lid_driven_cavity")
-    # ny=fields["u_magnitude"].shape[1]
-    # from xlb.utils import  save_image
-    # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
-
-
-def main():
-    args = parse_arguments()
-    compute_backend, precision_policy = setup_simulation(args)
-    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    from xlb.operator.macroscopic import Macroscopic
-
-    macro = Macroscopic(
-        compute_backend=ComputeBackend.NEON,
-        precision_policy=precision_policy,
-        velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=ComputeBackend.NEON),
-    )
-    elapsed_time = run(macro, compute_backend, precision_policy, grid_shape, args.num_steps)
-    mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
-
-    print(f"Simulation completed in {elapsed_time:.2f} seconds")
-    print(f"MLUPs: {mlups:.2f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index 1b197143..4ada5360 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -63,7 +63,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
             # call indices masker for this level
             bc_mask_warp, missing_mask_warp = self.indices_masker(bclist_level, bc_mask_warp, missing_mask_warp, start_index, xlb_grid)
 
-            @neon.Container.factory("")
+            @neon.Container.factory(name="MultiresBoundaryMasker")
             def container(
                 bc_mask_warp: typing.Any,
                 missing_mask_warp: typing.Any,

From 30d4fc1642656dd7150bc931dbbdf7b5386222d7 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 27 May 2025 14:59:40 -0400
Subject: [PATCH 054/208] Added capability to handle BC profiles in MRES

---
 .../grid_refinement/flow_past_sphere_3d.py    | 86 +++++--------------
 xlb/operator/boundary_condition/bc_zouhe.py   |  2 +-
 .../boundary_condition/boundary_condition.py  | 10 ++-
 .../multires_boundary_masker.py               |  1 -
 4 files changed, 31 insertions(+), 68 deletions(-)

diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index 9a1e891b..d0f59ed4 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -75,77 +75,33 @@
 
 # Define Boundary Conditions
 def bc_profile():
-    H_y = float(ny - 1)  # Height in y direction
-    H_z = float(nz - 1)  # Height in z direction
-
-    if compute_backend == ComputeBackend.JAX:
-
-        def bc_profile_jax():
-            y = jnp.arange(ny)
-            z = jnp.arange(nz)
-            Y, Z = jnp.meshgrid(y, z, indexing="ij")
-
-            # Calculate normalized distance from center
-            y_center = Y - (H_y / 2.0)
-            z_center = Z - (H_z / 2.0)
-            r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
-
-            # Parabolic profile for x velocity, zero for y and z
-            u_x = u_max * jnp.maximum(0.0, 1.0 - r_squared)
-            u_y = jnp.zeros_like(u_x)
-            u_z = jnp.zeros_like(u_x)
-
-            return jnp.stack([u_x, u_y, u_z])
-
-        return bc_profile_jax
-
-    elif compute_backend == ComputeBackend.WARP:
-
-        @wp.func
-        def bc_profile_warp(index: wp.vec3i):
-            # Poiseuille flow profile: parabolic velocity distribution
-            y = wp.float32(index[1])
-            z = wp.float32(index[2])
-
-            # Calculate normalized distance from center
-            y_center = y - (H_y / 2.0)
-            z_center = z - (H_z / 2.0)
-            r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
-
-            # Parabolic profile: u = u_max * (1 - r²)
-            return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
-
-    elif compute_backend == ComputeBackend.NEON:
-        raise NotImplementedError("BC profile not implemented yet!")
-        # wp.func
-        # def bc_profile_warp(index: Any):
-        #     # Get the refinement level for the current index
-        #     refinement = 2 ** grid.get_level(index)
-        #     cIdx = wp.neon_global_idx(bc_mask_hdl, index)
-        #     # get local indices by dividing the global indices (associated with the finest level) by 2^level
-        #     lx = wp.neon_get_x(cIdx) // refinement
-        #     ly = wp.neon_get_y(cIdx) // refinement
-        #     lz = wp.neon_get_z(cIdx) // refinement
-
-        #     # Poiseuille flow profile: parabolic velocity distribution
-        #     y = wp.float32(index[1])
-        #     z = wp.float32(index[2])
-
-        #     # Calculate normalized distance from center
-        #     y_center = y - (H_y / 2.0)
-        #     z_center = z - (H_z / 2.0)
-        #     r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
-
-        #     # Parabolic profile: u = u_max * (1 - r²)
-        #     return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+    assert compute_backend == ComputeBackend.NEON
+
+    # Note nx, ny, nz are the dimensions of the grid at the finest level
+    H_y = float(ny // 2 - 1)  # Height in y direction
+    H_z = float(nz // 2 - 1)  # Height in z direction
+
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i):
+        # Poiseuille flow profile: parabolic velocity distribution
+        y = wp.float32(index[1])
+        z = wp.float32(index[2])
+
+        # Calculate normalized distance from center
+        y_center = y - (H_y / 2.0)
+        z_center = z - (H_z / 2.0)
+        r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+
+        # Parabolic profile: u = u_max * (1 - r²)
+        return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
 
     return bc_profile_warp
 
 
 # Initialize Boundary Conditions
-# bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
+bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
 # Alternatively, use a prescribed velocity profile
-bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
+# bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
 bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 3705f77c..58148a38 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -116,7 +116,7 @@ def _create_constant_prescribed_profile(self):
         _prescribed_value = self.prescribed_value
 
         @wp.func
-        def prescribed_profile_warp(index: Any):
+        def prescribed_profile_warp(index: wp.vec3i):
             return wp.vec(_prescribed_value, length=1)
 
         def prescribed_profile_jax():
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 991b6e3a..34aa70f1 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -274,6 +274,9 @@ def aux_data_init_ll(loader: neon.Loader):
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask)
 
+                # Get the refinement factor for the current level
+                refinement = 2**level
+
                 @wp.func
                 def aux_data_init_cl(index: Any):
                     # read tid data
@@ -282,7 +285,12 @@ def aux_data_init_cl(index: Any):
                     # Apply the functional
                     if _boundary_id == _id:
                         # prescribed_values is a q-sized vector of type wp.vec
-                        prescribed_values = functional(index)
+                        warp_index = wp.vec3i()
+                        gloabl_index = wp.neon_global_idx(f_0_pn, index)
+                        warp_index[0] = wp.neon_get_x(gloabl_index) // refinement
+                        warp_index[1] = wp.neon_get_y(gloabl_index) // refinement
+                        warp_index[2] = wp.neon_get_z(gloabl_index) // refinement
+                        prescribed_values = functional(warp_index)
 
                     # Write the result for all q directions, but only store up to num_of_aux_data
                     counter = wp.int32(0)
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index 4ada5360..f9759f08 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -42,7 +42,6 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
         assert bc_mask.get_grid().get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
 
         # number of levels
-        # indices_per_level = []
         num_levels = bc_mask.get_grid().get_num_levels()
         for level in range(num_levels):
             # Use the warp backend to create dense fields to be written in multi-res NEON fields

From b2870827a241dfecdf9e948cf11cb038f7ad586e Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 27 May 2025 18:14:39 -0400
Subject: [PATCH 055/208] Reduced register pressure by avoiding reading
 f1_thread for all cells

---
 .../boundary_condition/boundary_condition.py  |  8 ++--
 .../boundary_condition/helper_functions_bc.py |  8 ++--
 xlb/operator/stepper/nse_multires_stepper.py  | 34 +++++++--------
 xlb/operator/stepper/nse_stepper.py           | 43 ++++++++-----------
 4 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 34aa70f1..bbc8e8f3 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -102,7 +102,7 @@ def kernel(
             index = wp.vec3i(i, j, k)
 
             # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = bc_helper.get_thread_data(f_pre, f_post, bc_mask, missing_mask, index)
+            _f_pre, _f_post, _boundary_id, _missing_mask = bc_helper.get_bc_thread_data(f_pre, f_post, bc_mask, missing_mask, index)
 
             # Apply the boundary condition
             if _boundary_id == _id:
@@ -140,7 +140,7 @@ def aux_data_init_kernel(
             index = wp.vec3i(i, j, k)
 
             # read tid data
-            _f_0, _f_1, _boundary_id, _missing_mask = bc_helper.get_thread_data(f_0, f_1, bc_mask, missing_mask, index)
+            _, _, _boundary_id, _missing_mask = bc_helper.get_bc_thread_data(f_0, f_1, bc_mask, missing_mask, index)
 
             # Apply the functional
             if _boundary_id == _id:
@@ -194,7 +194,7 @@ def aux_data_init_ll(loader: neon.Loader):
                 @wp.func
                 def aux_data_init_cl(index: Any):
                     # read tid data
-                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_thread_data(f_0_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
+                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_bc_thread_data(f_0_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
 
                     # Apply the functional
                     if _boundary_id == _id:
@@ -280,7 +280,7 @@ def aux_data_init_ll(loader: neon.Loader):
                 @wp.func
                 def aux_data_init_cl(index: Any):
                     # read tid data
-                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_thread_data(f_0_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
+                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_bc_thread_data(f_0_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
 
                     # Apply the functional
                     if _boundary_id == _id:
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index c4f1ddfe..b3e134e6 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -34,7 +34,7 @@ def __init__(self, velocity_set=None, precision_policy=None, compute_backend=Non
         momentum_flux = MomentumFlux(velocity_set, precision_policy, compute_backend)
 
         @wp.func
-        def get_thread_data(
+        def get_bc_thread_data(
             f_pre: wp.array4d(dtype=Any),
             f_post: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
@@ -59,7 +59,7 @@ def get_thread_data(
             return _f_pre, _f_post, _boundary_id, _missing_mask
 
         @wp.func
-        def neon_get_thread_data(
+        def neon_get_bc_thread_data(
             f_pre_pn: Any,
             f_post_pn: Any,
             bc_mask_pn: Any,
@@ -142,9 +142,9 @@ def regularize_fpop(
                 fpop[l] = feq[l] + fpop1
             return fpop
 
-        self.get_thread_data = get_thread_data
+        self.get_bc_thread_data = get_bc_thread_data
         self.get_bc_fsum = get_bc_fsum
         self.get_normal_vectors = get_normal_vectors
         self.bounceback_nonequilibrium = bounceback_nonequilibrium
         self.regularize_fpop = regularize_fpop
-        self.neon_get_thread_data = neon_get_thread_data
+        self.neon_get_bc_thread_data = neon_get_bc_thread_data
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 270c3b8d..38441c7a 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -298,21 +298,18 @@ def apply_bc(
         @wp.func
         def neon_get_thread_data(
             f0_pn: Any,
-            f1_pn: Any,
             missing_mask_pn: Any,
             index: Any,
         ):
             # Read thread data for populations
             _f0_thread = _f_vec()
-            _f1_thread = _f_vec()
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
                 # q-sized vector of pre-streaming populations
                 _f0_thread[l] = self.compute_dtype(wp.neon_read(f0_pn, index, l))
-                _f1_thread[l] = self.compute_dtype(wp.neon_read(f1_pn, index, l))
                 _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
 
-            return _f0_thread, _f1_thread, _missing_mask
+            return _f0_thread, _missing_mask
 
         @wp.func
         def neon_apply_aux_recovery_bc(
@@ -320,13 +317,13 @@ def neon_apply_aux_recovery_bc(
             _boundary_id: Any,
             _missing_mask: Any,
             f_0_pn: Any,
-            _f1_thread: Any,
+            f_1_pn: Any,
         ):
             # Note:
             # In XLB, the BC auxiliary data (e.g. prescribed values of pressure or normal velocity) are stored in (i) central index of f_1 and/or
             # (ii) missing directions of f_1. Some BCs may or may not need all these available storage space. This function checks whether
             # the BC needs recovery of auxiliary data and then recovers the information for the next iteration (due to buffer swapping) by
-            # writting the thread values of f_1 (i.e._f1_thread) into f_0.
+            # writting the values of f_1 into f_0.
 
             # Unroll the loop over boundary conditions
             for i in range(wp.static(len(self.boundary_conditions))):
@@ -337,11 +334,13 @@ def neon_apply_aux_recovery_bc(
                             if l == lattice_central_index:
                                 # (i) Recover the values stored in the central index of f_1
                                 # TODO: Add store dtype
-                                wp.neon_write(f_0_pn, index, l, _f1_thread[l])
+                                _f1_thread = wp.neon_read(f_1_pn, index, l)
+                                wp.neon_write(f_0_pn, index, l, _f1_thread)
                             elif _missing_mask[l] == wp.uint8(1):
                                 # (ii) Recover the values stored in the missing directions of f_1
                                 # TODO: Add store dtype
-                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
+                                _f1_thread = wp.neon_read(f_1_pn, index, _opp_indices[l])
+                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread)
 
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(
@@ -379,14 +378,14 @@ def device(index: Any):
                     """
                     The c++ version starts with the following, which I am not sure is right:
                         if (type(cell, 0) == CellType::bulk ) {
-                    CB type cells should do collide too  
+                    BC type cells should do collide too
                     """
                     if _boundary_id == wp.uint8(255):
                         return
 
                     if not wp.neon_has_child(f_0_pn, index):
                         # Read thread data for populations, these are post streaming
-                        _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                        _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                         _f_post_stream = _f0_thread
 
                         _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
@@ -398,6 +397,9 @@ def device(index: Any):
                             index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
                         )
 
+                        # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                        neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
                         # Accumulate the post-collision populations in f_0
                         for l in range(self.velocity_set.q):
                             push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
@@ -406,9 +408,6 @@ def device(index: Any):
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
                                 # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
 
-                            # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
-                            neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, _f1_thread)
-
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
@@ -465,7 +464,7 @@ def cl_stream_coarse(index: Any):
 
                     # do stream normally
                     _missing_mask = _missing_mask_vec()
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
@@ -526,7 +525,7 @@ def cl_stream_coarse(index: Any):
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
                     # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
-                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, _f1_thread)
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
@@ -575,7 +574,7 @@ def cl_stream_coarse(index: Any):
 
                     # do stream normally
                     _missing_mask = _missing_mask_vec()
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
@@ -712,7 +711,8 @@ def cl_stream_coarse(index: Any):
                     _missing_mask = _missing_mask_vec()
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     # do stream normally
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f1_thread, _missing_mask = neon_get_thread_data(f_1_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
                     _f_post_stream = _f1_thread
 
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index e2a58dbc..0e9280a6 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -260,24 +260,18 @@ def apply_bc(
         @wp.func
         def get_thread_data(
             f0_buffer: wp.array4d(dtype=Any),
-            f1_buffer: wp.array4d(dtype=Any),
             missing_mask: wp.array4d(dtype=Any),
             index: Any,
         ):
             # Read thread data for populations
             _f0_thread = _f_vec()
-            _f1_thread = _f_vec()
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
                 # q-sized vector of pre-streaming populations
                 _f0_thread[l] = self.compute_dtype(f0_buffer[l, index[0], index[1], index[2]])
-                _f1_thread[l] = self.compute_dtype(f1_buffer[l, index[0], index[1], index[2]])
-                if missing_mask[l, index[0], index[1], index[2]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
+                _missing_mask[l] = missing_mask[l, index[0], index[1], index[2]]
 
-            return _f0_thread, _f1_thread, _missing_mask
+            return _f0_thread, _missing_mask
 
         @wp.func
         def apply_aux_recovery_bc(
@@ -285,13 +279,13 @@ def apply_aux_recovery_bc(
             _boundary_id: Any,
             _missing_mask: Any,
             f_0: Any,
-            _f1_thread: Any,
+            f_1: Any,
         ):
             # Note:
             # In XLB, the BC auxiliary data (e.g. prescribed values of pressure or normal velocity) are stored in (i) central index of f_1 and/or
             # (ii) missing directions of f_1. Some BCs may or may not need all these available storage space. This function checks whether
             # the BC needs recovery of auxiliary data and then recovers the information for the next iteration (due to buffer swapping) by
-            # writting the thread values of f_1 (i.e._f1_thread) into f_0.
+            # writting the values of f_1 into f_0.
 
             # Unroll the loop over boundary conditions
             for i in range(wp.static(len(self.boundary_conditions))):
@@ -301,10 +295,12 @@ def apply_aux_recovery_bc(
                             # Perform the swapping of data
                             if l == lattice_central_index:
                                 # (i) Recover the values stored in the central index of f_1
-                                f_0[l, index[0], index[1], index[2]] = self.store_dtype(_f1_thread[l])
+                                f_0[l, index[0], index[1], index[2]] = self.store_dtype(f_1[l, index[0], index[1], index[2]])
                             elif _missing_mask[l] == wp.uint8(1):
                                 # (ii) Recover the values stored in the missing directions of f_1
-                                f_0[_opp_indices[l], index[0], index[1], index[2]] = self.store_dtype(_f1_thread[_opp_indices[l]])
+                                f_0[_opp_indices[l], index[0], index[1], index[2]] = self.store_dtype(
+                                    f_1[_opp_indices[l], index[0], index[1], index[2]]
+                                )
 
         @wp.kernel
         def kernel(
@@ -325,7 +321,7 @@ def kernel(
             # Apply streaming
             _f_post_stream = self.stream.warp_functional(f_0, index)
 
-            _f0_thread, _f1_thread, _missing_mask = get_thread_data(f_0, f_1, missing_mask, index)
+            _f0_thread, _missing_mask = get_thread_data(f_0, missing_mask, index)
             _f_post_collision = _f0_thread
 
             # Apply post-streaming boundary conditions
@@ -339,7 +335,7 @@ def kernel(
             _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_stream, _f_post_collision, False)
 
             # Apply auxiliary recovery for boundary conditions (swapping)
-            apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0, _f1_thread)
+            apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0, f_1)
 
             # Store the result in f_1
             for l in range(self.velocity_set.q):
@@ -407,21 +403,18 @@ def apply_bc(
         @wp.func
         def neon_get_thread_data(
             f0_pn: Any,
-            f1_pn: Any,
             missing_mask_pn: Any,
             index: Any,
         ):
             # Read thread data for populations
             _f0_thread = _f_vec()
-            _f1_thread = _f_vec()
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
                 # q-sized vector of pre-streaming populations
                 _f0_thread[l] = self.compute_dtype(wp.neon_read(f0_pn, index, l))
-                _f1_thread[l] = self.compute_dtype(wp.neon_read(f1_pn, index, l))
                 _missing_mask[l] = wp.neon_read(missing_mask_pn, index, l)
 
-            return _f0_thread, _f1_thread, _missing_mask
+            return _f0_thread, _missing_mask
 
         @wp.func
         def neon_apply_aux_recovery_bc(
@@ -429,13 +422,13 @@ def neon_apply_aux_recovery_bc(
             _boundary_id: Any,
             _missing_mask: Any,
             f_0_pn: Any,
-            _f1_thread: Any,
+            f_1_pn: Any,
         ):
             # Note:
             # In XLB, the BC auxiliary data (e.g. prescribed values of pressure or normal velocity) are stored in (i) central index of f_1 and/or
             # (ii) missing directions of f_1. Some BCs may or may not need all these available storage space. This function checks whether
             # the BC needs recovery of auxiliary data and then recovers the information for the next iteration (due to buffer swapping) by
-            # writting the thread values of f_1 (i.e._f1_thread) into f_0.
+            # writting the values of f_1 into f_0.
 
             # Unroll the loop over boundary conditions
             for i in range(wp.static(len(self.boundary_conditions))):
@@ -446,11 +439,13 @@ def neon_apply_aux_recovery_bc(
                             if l == lattice_central_index:
                                 # (i) Recover the values stored in the central index of f_1
                                 # TODO: Add store dtype
-                                wp.neon_write(f_0_pn, index, l, _f1_thread[l])
+                                _f1_thread = wp.neon_read(f_1_pn, index, l)
+                                wp.neon_write(f_0_pn, index, l, _f1_thread)
                             elif _missing_mask[l] == wp.uint8(1):
                                 # (ii) Recover the values stored in the missing directions of f_1
                                 # TODO: Add store dtype
-                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread[_opp_indices[l]])
+                                _f1_thread = wp.neon_read(f_1_pn, index, _opp_indices[l])
+                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread)
 
         @neon.Container.factory(name="nse_stepper")
         def container(
@@ -480,7 +475,7 @@ def nse_stepper_cl(index: Any):
                     # Apply streaming
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
-                    _f0_thread, _f1_thread, _missing_mask = neon_get_thread_data(f_0_pn, f_1_pn, missing_mask_pn, index)
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
 
                     # Apply post-streaming boundary conditions
@@ -496,7 +491,7 @@ def nse_stepper_cl(index: Any):
                     )
 
                     # Apply auxiliary recovery for boundary conditions (swapping)
-                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, _f1_thread)
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     # Store the result in f_1
                     for l in range(self.velocity_set.q):

From dd729ce315664ca64849a07e5f026db437d9a6c9 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 29 May 2025 13:09:38 -0400
Subject: [PATCH 056/208] fixed BC profile handling for Neon dense

---
 .../boundary_condition/boundary_condition.py        | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index bbc8e8f3..8ebaf482 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -199,7 +199,12 @@ def aux_data_init_cl(index: Any):
                     # Apply the functional
                     if _boundary_id == _id:
                         # prescribed_values is a q-sized vector of type wp.vec
-                        prescribed_values = functional(index)
+                        warp_index = wp.vec3i()
+                        gloabl_index = wp.neon_global_idx(f_0_pn, index)
+                        warp_index[0] = wp.neon_get_x(gloabl_index)
+                        warp_index[1] = wp.neon_get_y(gloabl_index)
+                        warp_index[2] = wp.neon_get_z(gloabl_index)
+                        prescribed_values = functional(warp_index)
 
                     # Write the result for all q directions, but only store up to num_of_aux_data
                     counter = wp.int32(0)
@@ -207,12 +212,14 @@ def aux_data_init_cl(index: Any):
                         # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
                         if l == lattice_central_index:
                             # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
-                            wp.neon_write(f_1_pn, index, l, self.store_dtype(prescribed_values[l]))
+                            # TODO: add self.store_dtype
+                            wp.neon_write(f_1_pn, index, l, prescribed_values[l])
                             counter += 1
                         elif _missing_mask[l] == wp.uint8(1):
                             # The other remaining BC auxiliary data are stored in missing directions of f_1.
                             # Only store up to num_of_aux_data
-                            wp.neon_write(f_1_pn, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
+                            # TODO: add self.store_dtype
+                            wp.neon_write(f_1_pn, index, _opp_indices[l], prescribed_values[l])
                             counter += 1
                         if counter > _num_of_aux_data:
                             # Only store up to num_of_aux_data

From 26aff01183928dec9b3f9229ecdeab56d1154f3f Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 29 May 2025 13:34:32 -0400
Subject: [PATCH 057/208] Loop over all levels inside the operator, not at
 its call site

---
 xlb/helper/initializers.py                               | 4 +---
 .../equilibrium/mulltires_quadratic_equilibrium.py       | 9 +++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 82e62443..6ad4dfa2 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -26,7 +26,5 @@ def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=
 
 def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho, u):
     equilibrium = MultiresQuadraticEquilibrium()
-    for level in range(grid.count_levels):
-        print("MultiresQuadraticEquilibrium")
-        equilibrium(rho=rho, u=u, f=f, level=level, stream=0)
+    equilibrium(rho, u, f, stream=0)
     return f
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
index c4eb43b5..a539216e 100644
--- a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
@@ -60,8 +60,9 @@ def quadratic_equilibrium_cl(index: Any):
         return functional, container
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, rho, u, f, level, stream):
-        c = self.neon_container(rho, u, f, level)
-        c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
-
+    def neon_implementation(self, rho, u, f, stream=0):
+        grid = f.get_grid()
+        for level in range(grid.num_levels):
+            c = self.neon_container(rho, u, f, level)
+            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
         return f

From 84a8fa4fd4239d41772c2de7897c75d5f3710ffc Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 29 May 2025 13:52:51 -0400
Subject: [PATCH 058/208] no need to have separate functions for initializing
 and launching containers. Operator definition does that.

---
 xlb/helper/simulation_manager.py                |  6 ++----
 .../macroscopic/multires_macroscopic.py         | 17 +++++------------
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index bfbac21d..d548a694 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -1,4 +1,5 @@
 import neon
+import warp as wp
 
 
 class MultiresSimulationManager:
@@ -52,13 +53,10 @@ def __init_containers(self, num_levels):
         self.macroscopics = {}
 
         self.stepper.init_containers()
-        self.macro.init_containers()
 
     def export_macroscopic(self, fname_prefix):
         print(f"exporting macroscopic: #levels {self.grid.count_levels}")
-        self.macro.launch_container(streamId=0, f_0=self.f_0, bc_mask=self.bc_mask, rho=self.rho, u=self.u)
-
-        import warp as wp
+        self.macro(self.f_0, self.bc_mask, self.rho, self.u, streamId=0)
 
         wp.synchronize()
         self.u.update_host(0)
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index 12e5b76b..6900d23e 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -75,18 +75,11 @@ def macroscopic_cl(gIdx: typing.Any):
 
         return functional, container
 
-    def init_containers(self):
-        self.containers = None
-        _, self.containers = self._construct_neon()
-
-    def launch_container(self, streamId, f_0, bc_mask, rho, u):
-        grid = f_0.get_grid()
-        for target_level in range(grid.num_levels):
-            self.containers(target_level, f_0, bc_mask, rho, u).run(streamId)
-
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, f, rho, u):
-        c = self.neon_container(f, rho, u)
-        c.run(0)
+    def neon_implementation(self, f, bc_mask, rho, u, streamId=0):
+        grid = f.get_grid()
+        for level in range(grid.num_levels):
+            c = self.neon_container(level, f, bc_mask, rho, u)
+            c.run(streamId)
         wp.synchronize()
         return rho, u

From 9099f29adf2b85b8ebc1e11cda6dedc8bc837f1d Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 29 May 2025 23:10:00 -0400
Subject: [PATCH 059/208] Simplified the multires simulation manager a lot by
 inheriting from MultiresStepper

---
 .../grid_refinement/flow_past_sphere_3d.py    |  8 +-
 examples/performance/mlups_3d_multires.py     |  8 +-
 xlb/helper/simulation_manager.py              | 81 ++++++++++---------
 xlb/operator/macroscopic/macroscopic.py       |  1 -
 .../macroscopic/multires_macroscopic.py       |  1 -
 xlb/operator/stepper/nse_multires_stepper.py  | 16 ++--
 6 files changed, 54 insertions(+), 61 deletions(-)

diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index d0f59ed4..98403b88 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -108,16 +108,14 @@ def bc_profile_warp(index: wp.vec3i):
 bc_sphere = HalfwayBounceBackBC(indices=sphere)
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
 
-# Setup Stepper
-stepper = MultiresIncompressibleNavierStokesStepper(
+# Define a multi-resolution simulation manager
+sim = xlb.helper.MultiresSimulationManager(
+    omega=omega,
     grid=grid,
     boundary_conditions=boundary_conditions,
     collision_type="BGK",
 )
 
-# Define a multi-resolution simulation manager
-sim = xlb.helper.MultiresSimulationManager(grid, velocity_set, stepper, omega)
-
 # -------------------------- Simulation Loop --------------------------
 
 wp.synchronize()
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 54cf3865..808b13d1 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -196,16 +196,14 @@ def run(velocity_set, grid_shape, num_steps):
         FullwayBounceBackBC(indices=walls),
     ]
 
-    # Create stepper
-    stepper = MultiresIncompressibleNavierStokesStepper(grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC")
-
+    # Problem parameters
     Re = 5000.0
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
     omega = 1.0 / (3.0 * visc + 0.5)
-    # omega = 1.0
 
-    sim = xlb.helper.MultiresSimulationManager(grid, velocity_set, stepper, omega)
+    # Define a multi-resolution simulation manager
+    sim = xlb.helper.MultiresSimulationManager(omega=omega, grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC")
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index d548a694..b639e58a 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -1,21 +1,31 @@
 import neon
 import warp as wp
+from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
+from xlb.operator.macroscopic import MultiresMacroscopic
+
+
+class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
+    """
+    A simulation manager for multiresolution simulations using the Neon backend in XLB.
+    """
+
+    def __init__(
+        self,
+        omega,
+        grid,
+        boundary_conditions=[],
+        collision_type="BGK",
+        forcing_scheme="exact_difference",
+        force_vector=None,
+    ):
+        super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
-
-class MultiresSimulationManager:
-    def __init__(self, grid, velocity_set, stepper, omega):
-        self.stepper = stepper
-        self.grid = stepper.get_grid()
-        self.precision_policy = stepper.get_precision_policy()
-        self.velocity_set = velocity_set
         self.omega = omega
         self.count_levels = grid.count_levels
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
-        self.coalescence_factor = grid.create_field(cardinality=velocity_set.q, dtype=self.precision_policy.store_precision)
-
-        fname_prefix = "test"
+        self.coalescence_factor = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
 
         for level in range(self.count_levels):
             self.u.fill_run(level, 0.0, 0)
@@ -27,8 +37,9 @@ def __init__(self, grid, velocity_set, stepper, omega):
         # wp.synchronize()
         # self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
 
-        self.f_0, self.f_1, self.bc_mask, self.missing_mask = stepper.prepare_fields(rho=self.rho, u=self.u)
-        stepper.prepare_coalescence_count(coalescence_factor=self.coalescence_factor, bc_mask=self.bc_mask)
+        # Prepare fields
+        self.f_0, self.f_1, self.bc_mask, self.missing_mask = self.prepare_fields(self.rho, self.u)
+        self.prepare_coalescence_count(coalescence_factor=self.coalescence_factor, bc_mask=self.bc_mask)
 
         # wp.synchronize()
         # self.u.update_host(0)
@@ -36,26 +47,18 @@ def __init__(self, grid, velocity_set, stepper, omega):
         # self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
 
         self.iteration_idx = -1
-        from xlb.operator.macroscopic import MultiresMacroscopic
 
         self.macro = MultiresMacroscopic(
-            compute_backend=self.grid.compute_backend,
+            compute_backend=self.compute_backend,
             precision_policy=self.precision_policy,
             velocity_set=self.velocity_set,
         )
 
-        self.__init_containers(self.count_levels)
-        self._step_init()
-
-    def __init_containers(self, num_levels):
-        # working only with level 0 for now
-        self.containers = {}
-        self.macroscopics = {}
-
-        self.stepper.init_containers()
+        # Construct the stepper skeleton
+        self._construct_stepper_skeleton()
 
     def export_macroscopic(self, fname_prefix):
-        print(f"exporting macroscopic: #levels {self.grid.count_levels}")
+        print(f"exporting macroscopic: #levels {self.count_levels}")
         self.macro(self.f_0, self.bc_mask, self.rho, self.u, streamId=0)
 
         wp.synchronize()
@@ -70,17 +73,17 @@ def step(self):
         self.iteration_idx = self.iteration_idx + 1
         self.sk.run()
 
-    # one step at the corase level
-    def _step_init(self):
+    # Construct the stepper skeleton
+    def _construct_stepper_skeleton(self):
         self.app = []
 
-        def recurtion(level, app):
+        def recursion(level, app):
             if level < 0:
                 return
-            print(f"RECURTION down to level {level}")
-            print(f"RECURTION Level {level}, COLLIDE")
+            print(f"RECURSION down to level {level}")
+            print(f"RECURSION Level {level}, COLLIDE")
 
-            self.stepper.add_to_app(
+            self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
                 mres_level=level,
@@ -104,12 +107,12 @@ def recurtion(level, app):
             #     #sys.exit()
             #     pass
 
-            recurtion(level - 1, app)
-            recurtion(level - 1, app)
+            recursion(level - 1, app)
+            recursion(level - 1, app)
 
             # Important: swapping of f_0 and f_1 is done here
-            print(f"RECURTION Level {level}, stream_coarse_step_ABC")
-            self.stepper.add_to_app(
+            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
                 mres_level=level,
@@ -120,9 +123,9 @@ def recurtion(level, app):
                 omega=self.coalescence_factor,
                 timestep=0,
             )
-            # print(f"RECURTION Level {level}, stream_coarse_step_B")
+            # print(f"RECURSION Level {level}, stream_coarse_step_B")
             #
-            # self.stepper.add_to_app(
+            # self.add_to_app(
             #     app=app,
             #     op_name="stream_coarse_step_B",
             #     mres_level=level,
@@ -134,9 +137,9 @@ def recurtion(level, app):
             #     timestep=0,
             # )
 
-            # print(f"RECURTION Level {level}, stream_coarse_step_C")
+            # print(f"RECURSION Level {level}, stream_coarse_step_C")
             #
-            # self.stepper.add_to_app(
+            # self.add_to_app(
             #     app=app,
             #     op_name="stream_coarse_step_C",
             #     mres_level=level,
@@ -160,7 +163,7 @@ def recurtion(level, app):
             #     sys.exit()
             #     pass
 
-        recurtion(self.count_levels - 1, app=self.app)
+        recursion(self.count_levels - 1, app=self.app)
         bk = self.grid.get_neon_backend()
         self.sk = neon.Skeleton(backend=bk)
         self.sk.sequence("mres_nse_stepper", self.app)
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 4fd464ad..61a6ef88 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -110,5 +110,4 @@ def macroscopic_cl(gIdx: typing.Any):
     def neon_implementation(self, f, rho, u):
         c = self.neon_container(f, rho, u)
         c.run(0)
-        wp.synchronize()
         return rho, u
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index 6900d23e..6cab84d8 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -81,5 +81,4 @@ def neon_implementation(self, f, bc_mask, rho, u, streamId=0):
         for level in range(grid.num_levels):
             c = self.neon_container(level, f, bc_mask, rho, u)
             c.run(streamId)
-        wp.synchronize()
         return rho, u
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 38441c7a..6736197e 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -14,8 +14,8 @@
 from xlb.operator import Operator
 from xlb.operator.stream import Stream
 from xlb.operator.collision import BGK, KBC
-from xlb.operator.equilibrium import QuadraticEquilibrium
-from xlb.operator.macroscopic import Macroscopic
+from xlb.operator.equilibrium import MultiresQuadraticEquilibrium
+from xlb.operator.macroscopic import MultiresMacroscopic
 from xlb.operator.stepper import Stepper
 from xlb.operator.boundary_condition.boundary_condition import ImplementationStep
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
@@ -46,8 +46,8 @@ def __init__(
 
         # Construct the operators
         self.stream = Stream(self.velocity_set, self.precision_policy, self.compute_backend)
-        self.equilibrium = QuadraticEquilibrium(self.velocity_set, self.precision_policy, self.compute_backend)
-        self.macroscopic = Macroscopic(self.velocity_set, self.precision_policy, self.compute_backend)
+        self.equilibrium = MultiresQuadraticEquilibrium(self.velocity_set, self.precision_policy, self.compute_backend)
+        self.macroscopic = MultiresMacroscopic(self.velocity_set, self.precision_policy, self.compute_backend)
 
     def prepare_fields(self, rho, u, initializer=None):
         """Prepare the fields required for the stepper.
@@ -735,15 +735,11 @@ def cl_stream_coarse(index: Any):
             "stream_coarse_step_C": stream_coarse_step_C,
         }
 
-    def init_containers(self):
-        self.containers = None
-        _, self.containers = self._construct_neon()
-
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
-        self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
+        self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
     def add_to_app(self, app, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
-        app.append(self.containers[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
+        app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):

From 56e51d81f10b0513971c475ddfd6e44acd8c247e Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 2 Jun 2025 16:43:04 -0400
Subject: [PATCH 060/208] Fixed boundary masker indexing: swap y/z only in 2D

---
 .../boundary_masker/indices_boundary_masker.py      | 13 ++++++++++---
 .../boundary_masker/multires_boundary_masker.py     | 13 ++++++++++---
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 379300b1..5e6f1092 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -233,6 +233,9 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
         import neon
 
+        # Make constants
+        _d = self.velocity_set.d
+
         # Pre-allocate arrays with maximum possible size
         grid_warp = grid_factory(xlb_grid.shape, compute_backend=ComputeBackend.WARP, velocity_set=self.velocity_set)
         missing_mask_warp = grid_warp.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
@@ -265,12 +268,16 @@ def masker(gridIdx: Any):
                     gx = wp.neon_get_x(cIdx)
                     gy = wp.neon_get_y(cIdx)
                     gz = wp.neon_get_z(cIdx)
-                    # TODO@Max - XLB is flattening the y dimension in 3D, while neon uses the z dimension
-                    local_mask = bc_mask_warp[0, gx, gz, gy]
+
+                    # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+                    if _d == 2:
+                        gy, gz = gz, gy
+
+                    local_mask = bc_mask_warp[0, gx, gy, gz]
                     wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
 
                     for q in range(self.velocity_set.q):
-                        is_missing = wp.uint8(missing_mask_warp[q, gx, gz, gy])
+                        is_missing = wp.uint8(missing_mask_warp[q, gx, gy, gz])
                         wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
 
                 loader.declare_kernel(masker)
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index f9759f08..65833abb 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -41,6 +41,9 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
         # Ensure that this operator is called on multires grids
         assert bc_mask.get_grid().get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
 
+        # Make constants
+        _d = self.velocity_set.d
+
         # number of levels
         num_levels = bc_mask.get_grid().get_num_levels()
         for level in range(num_levels):
@@ -81,12 +84,16 @@ def masker(gridIdx: typing.Any):
                         lx = wp.neon_get_x(cIdx) // refinement
                         ly = wp.neon_get_y(cIdx) // refinement
                         lz = wp.neon_get_z(cIdx) // refinement
-                        # TODO@Max - XLB is flattening the y dimension in 3D, while neon uses the z dimension
-                        local_mask = bc_mask_warp[0, lx, lz, ly]
+
+                        # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+                        if _d == 2:
+                            ly, lz = lz, ly
+
+                        local_mask = bc_mask_warp[0, lx, ly, lz]
                         wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
 
                         for q in range(self.velocity_set.q):
-                            is_missing = wp.uint8(missing_mask_warp[q, lx, lz, ly])
+                            is_missing = wp.uint8(missing_mask_warp[q, lx, ly, lz])
                             wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
 
                     loader.declare_kernel(masker)

From 948c5316c257067704a11b1333662100c9b3803a Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 3 Jun 2025 10:51:34 -0400
Subject: [PATCH 061/208] no need to pass xlb_grid

---
 xlb/operator/boundary_masker/indices_boundary_masker.py | 9 +++++----
 xlb/operator/stepper/nse_stepper.py                     | 6 +++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 5e6f1092..a2b8c7cd 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -210,7 +210,7 @@ def _prepare_warp_kernel_inputs(self, bclist, bc_mask):
         return total_index, wp_indices, wp_id_numbers, wp_is_interior
 
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+    def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         # prepare warp kernel inputs
         total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_warp_kernel_inputs(bclist, bc_mask)
 
@@ -230,14 +230,15 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
         return bc_mask, missing_mask
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         import neon
 
         # Make constants
         _d = self.velocity_set.d
 
         # Pre-allocate arrays with maximum possible size
-        grid_warp = grid_factory(xlb_grid.shape, compute_backend=ComputeBackend.WARP, velocity_set=self.velocity_set)
+        grid_shape = bc_mask.shape[1:]  # (nx, ny) for 2D or (nx, ny, nz) for 3D
+        grid_warp = grid_factory(grid_shape, compute_backend=ComputeBackend.WARP, velocity_set=self.velocity_set)
         missing_mask_warp = grid_warp.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
         bc_mask_warp = grid_warp.create_field(cardinality=1, dtype=Precision.UINT8)
 
@@ -247,7 +248,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
             precision_policy=self.precision_policy,
             compute_backend=ComputeBackend.WARP,
         )
-        bc_mask_warp, missing_mask_warp = indices_masker_warp(bclist, bc_mask_warp, missing_mask_warp, start_index, xlb_grid)
+        bc_mask_warp, missing_mask_warp = indices_masker_warp(bclist, bc_mask_warp, missing_mask_warp, start_index)
         wp.synchronize()
 
         @neon.Container.factory("")
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 0e9280a6..a0788180 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -114,7 +114,7 @@ def prepare_fields(self, initializer=None):
             # u.export_vti("u_f1_init.vti", 'u')
             # rho.export_vti("rho_f1_init.vti", 'rho')
         # Process boundary conditions and update masks
-        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
+        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask)
         # Initialize auxiliary data if needed
         f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
         # bc_mask.update_host(0)
@@ -126,7 +126,7 @@ def prepare_fields(self, initializer=None):
         return f_0, f_1, bc_mask, missing_mask
 
     @classmethod
-    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
+    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask):
         """Process boundary conditions and update boundary masks."""
         # Check for boundary condition overlaps
         check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
@@ -141,7 +141,7 @@ def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask
         bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
         # Process indices-based boundary conditions
         if bc_with_indices:
-            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
+            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
             mesh_masker = MeshBoundaryMasker(

From 93e25b7832f668bcd2889e70eab7c3c274ff4feb Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 3 Jun 2025 11:04:28 -0400
Subject: [PATCH 062/208] Added padding to allow non cubic domains.

---
 .../grid_refinement/flow_past_sphere_3d.py    | 56 ++++++++++++++-----
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index 98403b88..a016805f 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -2,24 +2,22 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.precision_policy import PrecisionPolicy
 from xlb.grid import multires_grid_factory
-from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import FullwayBounceBackBC, HalfwayBounceBackBC, RegularizedBC, ExtrapolationOutflowBC, DoNothingBC, ZouHeBC
 import neon
 import warp as wp
 import numpy as np
-import jax.numpy as jnp
 import time
 
 # -------------------------- Simulation Setup --------------------------
 
-omega = 1.6
-grid_shape = (256 // 2, 256 // 2, 256 // 2)
+Re = 500.0
+grid_shape = (512 // 2, 128 // 2, 128 // 2)
 compute_backend = ComputeBackend.NEON
 precision_policy = PrecisionPolicy.FP32FP32
 velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
 u_max = 0.04
-num_steps = 2000
-post_process_interval = 100
+num_steps = 10000
+post_process_interval = 1000
 
 # Initialize XLB
 xlb.init(
@@ -29,24 +27,51 @@
 )
 
 # Create the multires grid
-# TODO: with rectangular cuboid for the inner box, there are some issues with the
-#       multires_grid_factory. The inner box should be a cube for now!
 nx, ny, nz = grid_shape
-sphere_origin = (nx // 2, ny // 2, nz // 2)
+sphere_origin = (nx // 6, ny // 2 + 1, nz // 2)  # +1 added to make a slight assymmetric offset in y direction for vortex street to emerge.
 sphere_radius = ny // 12
-inner_box_shape = (6 * sphere_radius, 6 * sphere_radius, 6 * sphere_radius)
+inner_box_shape = (12 * sphere_radius, 6 * sphere_radius, 6 * sphere_radius)
 num_levels = 2
-level_1 = np.ones((nx // 2, ny // 2, nz // 2), dtype=int)
+
+
+def pad_to_cube(arr):
+    shape = arr.shape
+    max_dim = max(shape)
+    pad_width = []
+    shift = []
+    for dim in shape:
+        total_pad = max_dim - dim
+        before = total_pad // 2
+        after = total_pad - before
+        pad_width.append((before, after))
+        shift.append(before)
+    return np.pad(arr, pad_width, mode="constant", constant_values=0), shift
+
+
 level_0 = np.ones(inner_box_shape, dtype=int)
+level_1 = np.ones((nx // 2, ny // 2, nz // 2), dtype=int)
+
+# Pad both levels to cubes
+# TODO: with rectangular cuboid for the inner box, there are some issues with the
+#       multires_grid_factory. The inner box should be a cube for now!
+# For now we hack this by padding the level_0 and level_1 to be cubes
+level_0, shift_0 = pad_to_cube(level_0)
+level_1, shift_1 = pad_to_cube(level_1)
+
+# Ensure level_0 is contiguous int32
 level_0 = np.ascontiguousarray(level_0, dtype=np.int32)
+
+# Create the multiresolution grid
 levels = [level_0, level_1]
-level_origins = [((nx - inner_box_shape[0]) // 2, (ny - inner_box_shape[1]) // 2, (nz - inner_box_shape[2]) // 2), (0, 0, 0)]
+shifts = [shift_0, shift_1]
+level_origins = [(sphere_origin[0] - 2 * sphere_radius, ny // 2 - inner_box_shape[1] // 2, nz // 2 - inner_box_shape[2] // 2), (0, 0, 0)]
+new_level_origins = [tuple(max(0, a - b) for a, b in zip(origin, shift)) for origin, shift in zip(level_origins, shifts)]
 
 grid = multires_grid_factory(
     grid_shape,
     velocity_set=velocity_set,
     sparsity_pattern_list=[level_0, level_1],
-    sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
+    sparsity_pattern_origins=[neon.Index_3d(*new_level_origins[lvl]) for lvl in range(num_levels)],
 )
 
 # Define Boundary Indices
@@ -108,6 +133,11 @@ def bc_profile_warp(index: wp.vec3i):
 bc_sphere = HalfwayBounceBackBC(indices=sphere)
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
 
+
+# configure the simulation relaxation time
+visc = 2.0 * u_max * sphere_radius / Re
+omega = 1.0 / (3.0 * visc + 0.5)
+
 # Define a multi-resolution simulation manager
 sim = xlb.helper.MultiresSimulationManager(
     omega=omega,

From 8654fb3871e00ad04b829782e234c527e4c0622c Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 3 Jun 2025 11:09:44 -0400
Subject: [PATCH 063/208] added a mesher utility function for multires

---
 xlb/utils/__init__.py |   1 +
 xlb/utils/mesher.py   | 121 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 xlb/utils/mesher.py

diff --git a/xlb/utils/__init__.py b/xlb/utils/__init__.py
index 3c8032e2..722febfa 100644
--- a/xlb/utils/__init__.py
+++ b/xlb/utils/__init__.py
@@ -7,3 +7,4 @@
     voxelize_stl,
     axangle2mat,
 )
+from .mesher import make_cuboid_mesh
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
new file mode 100644
index 00000000..6b6a8fa0
--- /dev/null
+++ b/xlb/utils/mesher.py
@@ -0,0 +1,121 @@
+import numpy as np
+import open3d as o3d
+
+
+def adjust_bbox(cuboid_max, cuboid_min, voxel_size_coarsest):
+    """
+    Snap each bounding-box corner to the nearest multiple of voxel_size_coarsest (rounded, so the box may shrink slightly).
+
+    Args:
+        cuboid_max (np.ndarray): Desired maximum coordinates of the bounding box.
+        cuboid_min (np.ndarray): Desired minimum coordinates of the bounding box.
+        voxel_size_coarsest (float): Voxel size of the coarsest grid (level 0).
+
+    Returns:
+        tuple: (adjusted_min, adjusted_max) snapped to level 0 grid points.
+    """
+    adjusted_min = np.round(cuboid_min / voxel_size_coarsest) * voxel_size_coarsest
+    adjusted_max = np.round(cuboid_max / voxel_size_coarsest) * voxel_size_coarsest
+    return adjusted_min, adjusted_max
+
+
+def make_cuboid_mesh(voxel_size, cuboids, stl_name):
+    """
+    Create a multi-level cuboid mesh with bounding boxes aligned to the level 0 grid.
+    Voxel matrices are set to ones only in regions not covered by finer levels.
+
+    Args:
+        voxel_size (float): Voxel size of the finest grid.
+        cuboids (list): Per-level multipliers [-x, +x, -y, +y, -z, +z] of the part size, coarsest level first.
+        stl_name (str): Path to the STL file.
+
+    Returns:
+        list: Level data with voxel matrices, voxel sizes, origins, and levels.
+    """
+    # Load the mesh and get its bounding box
+    mesh = o3d.io.read_triangle_mesh(stl_name)
+    if mesh.is_empty():
+        raise ValueError("Loaded mesh is empty or invalid.")
+
+    aabb = mesh.get_axis_aligned_bounding_box()
+    min_bound = aabb.get_min_bound()
+    max_bound = aabb.get_max_bound()
+    partSize = max_bound - min_bound
+
+    level_data = []
+    adjusted_bboxes = []
+    max_voxel_size = voxel_size * pow(2, (len(cuboids) - 1))
+    # Step 1: Generate all levels and store their data
+    for level in range(len(cuboids)):
+        # Compute desired bounding box for this level
+        cuboid_min = np.array(
+            [
+                min_bound[0] - cuboids[level][0] * partSize[0],
+                min_bound[1] - cuboids[level][2] * partSize[1],
+                min_bound[2] - cuboids[level][4] * partSize[2],
+            ],
+            dtype=float,
+        )
+
+        cuboid_max = np.array(
+            [
+                max_bound[0] + cuboids[level][1] * partSize[0],
+                max_bound[1] + cuboids[level][3] * partSize[1],
+                max_bound[2] + cuboids[level][5] * partSize[2],
+            ],
+            dtype=float,
+        )
+
+        # Set voxel size for this level
+        voxel_size_level = max_voxel_size / pow(2, level)
+        if level > 0:
+            voxel_level_up = max_voxel_size / pow(2, level - 1)
+        else:
+            voxel_level_up = voxel_size_level
+        # Snap the bounding box to the next-coarser level's grid (voxel_level_up); level 0 snaps to its own grid
+        adjusted_min, adjusted_max = adjust_bbox(cuboid_max, cuboid_min, voxel_level_up)
+
+        xmin, ymin, zmin = adjusted_min
+        xmax, ymax, zmax = adjusted_max
+
+        cuboid = adjusted_max - adjusted_min
+
+        # Compute number of voxels based on level-specific voxel size
+        nx = int(np.round((xmax - xmin) / voxel_size_level))
+        ny = int(np.round((ymax - ymin) / voxel_size_level))
+        nz = int(np.round((zmax - zmin) / voxel_size_level))
+        print(f"Domain {nx}, {ny}, {nz}  Origin {adjusted_min}  Voxel Size {voxel_size_level} Voxel Level Up {voxel_level_up}")
+
+        voxel_matrix = np.ones((nx, ny, nz), dtype=bool)
+
+        origin = adjusted_min
+        level_data.append((voxel_matrix, voxel_size_level, origin, level))
+        adjusted_bboxes.append((adjusted_min, adjusted_max))
+
+    # Step 2: Adjust coarser levels to exclude regions covered by finer levels
+    for k in range(len(level_data) - 1):  # Exclude the finest level
+        # Current level's data
+        voxel_matrix_k = level_data[k][0]
+        origin_k = level_data[k][2]
+        voxel_size_k = level_data[k][1]
+        nx, ny, nz = voxel_matrix_k.shape
+
+        # Next finer level's bounding box
+        adjusted_min_k1, adjusted_max_k1 = adjusted_bboxes[k + 1]
+
+        # Compute index ranges in level k that overlap with level k+1's bounding box
+        # Use epsilon (1e-10) to handle floating-point precision
+        i_start = max(0, int(np.ceil((adjusted_min_k1[0] - origin_k[0] - 1e-10) / voxel_size_k)))
+        i_end = min(nx, int(np.floor((adjusted_max_k1[0] - origin_k[0] + 1e-10) / voxel_size_k)))
+        j_start = max(0, int(np.ceil((adjusted_min_k1[1] - origin_k[1] - 1e-10) / voxel_size_k)))
+        j_end = min(ny, int(np.floor((adjusted_max_k1[1] - origin_k[1] + 1e-10) / voxel_size_k)))
+        k_start = max(0, int(np.ceil((adjusted_min_k1[2] - origin_k[2] - 1e-10) / voxel_size_k)))
+        k_end = min(nz, int(np.floor((adjusted_max_k1[2] - origin_k[2] + 1e-10) / voxel_size_k)))
+
+        # Set overlapping region to zero
+        voxel_matrix_k[i_start:i_end, j_start:j_end, k_start:k_end] = 0
+
+    # Step 3 Convert to Indices from STL units
+    level_data = [(dr, int(v / voxel_size), np.round(dOrigin / voxel_size).astype(int), l) for dr, v, dOrigin, l in level_data]
+
+    return level_data

From 48139cec5e6f4d902b7065663be37920c305fb0b Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 3 Jun 2025 11:10:54 -0400
Subject: [PATCH 064/208] enabling mesh masker for multires cases

---
 .../grid_refinement/flow_past_sphere_3d.py    |  2 +-
 .../multires_boundary_masker.py               | 14 +++++++++----
 xlb/operator/stepper/nse_multires_stepper.py  | 20 +++----------------
 3 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index a016805f..700967c4 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -28,7 +28,7 @@
 
 # Create the multires grid
 nx, ny, nz = grid_shape
-sphere_origin = (nx // 6, ny // 2 + 1, nz // 2)  # +1 added to make a slight assymmetric offset in y direction for vortex street to emerge.
+sphere_origin = (nx // 6, ny // 2, nz // 2)
 sphere_radius = ny // 12
 inner_box_shape = (12 * sphere_radius, 6 * sphere_radius, 6 * sphere_radius)
 num_levels = 2
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index 65833abb..0728607d 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -55,15 +55,21 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
             bc_mask_warp = grid_dense.create_field(cardinality=1, dtype=Precision.UINT8)
 
             # create a new bclist for this level only
-            bclist_level = []
+            bc_with_indices = []
             for bc in bclist:
                 if bc.indices is not None and bc.indices[level]:
                     bc_copy = copy.copy(bc)  # shallow copy of the whole object
                     bc_copy.indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
-                    bclist_level.append(bc_copy)
+                    bc_with_indices.append(bc_copy)
+                elif bc.mesh_vertices is not None:
+                    bc_copy = copy.copy(bc)  # shallow copy of the whole object
+                    bc_copy.mesh_vertices = copy.deepcopy(bc.mesh_vertices)
+
+                    # call mesh masker for this bc at this level
+                    bc_mask_warp, missing_mask_warp = self.mesh_masker(bc_copy, bc_mask_warp, missing_mask_warp)
 
-            # call indices masker for this level
-            bc_mask_warp, missing_mask_warp = self.indices_masker(bclist_level, bc_mask_warp, missing_mask_warp, start_index, xlb_grid)
+            # call indices masker for all BC's with indices at this level
+            bc_mask_warp, missing_mask_warp = self.indices_masker(bc_with_indices, bc_mask_warp, missing_mask_warp, start_index)
 
             @neon.Container.factory(name="MultiresBoundaryMasker")
             def container(
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 6736197e..3e53dbc1 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -218,23 +218,9 @@ def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
         )
-        # Split boundary conditions by type
-        bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
-        bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
-        # Process indices-based boundary conditions
-        if bc_with_indices:
-            bc_mask, missing_mask = mres_masker(bc_with_indices, bc_mask, missing_mask, xlb_grid=xlb_grid)
-        # Process mesh-based boundary conditions for 3D
-        if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
-            # throw an exception because this option is not implemented yet
-            raise Exception("Mesh-based boundary conditions are not implemented yet")
-            # mesh_masker = MeshBoundaryMasker(
-            #     velocity_set=DefaultConfig.velocity_set,
-            #     precision_policy=DefaultConfig.default_precision_policy,
-            #     compute_backend=DefaultConfig.default_backend,
-            # )
-            # for bc in bc_with_vertices:
-            #     bc_mask, missing_mask = mesh_masker(bc, bc_mask, missing_mask)
+
+        # Process all boundary conditions, either defined by indices or mesh_vertices
+        bc_mask, missing_mask = mres_masker(boundary_conditions, bc_mask, missing_mask, xlb_grid=xlb_grid)
 
         return bc_mask, missing_mask
 

From 343e7df7dc6a6482252ddb9dc6ecc82a20d4f10f Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 3 Jun 2025 17:36:14 -0400
Subject: [PATCH 065/208] fixed the output order of cuboid mesher

---
 xlb/utils/mesher.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 6b6a8fa0..ed6f971b 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -118,4 +118,5 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_name):
     # Step 3 Convert to Indices from STL units
     level_data = [(dr, int(v / voxel_size), np.round(dOrigin / voxel_size).astype(int), l) for dr, v, dOrigin, l in level_data]
 
-    return level_data
+    # Reverse to have finest level first
+    return list(reversed(level_data))

From 57c99cd3a0e44d619c41ed4248512900e303bab3 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 3 Jun 2025 17:36:45 -0400
Subject: [PATCH 066/208] Added a new example to showcase STL handling in MRES

---
 .../grid_refinement/flow_past_sphere_3d.py    |  18 +-
 .../stl_flow_past_sphere_3d.py                | 228 ++++++++++++++++++
 2 files changed, 234 insertions(+), 12 deletions(-)
 create mode 100644 examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py

diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index 700967c4..dba86d3d 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -38,14 +38,10 @@ def pad_to_cube(arr):
     shape = arr.shape
     max_dim = max(shape)
     pad_width = []
-    shift = []
     for dim in shape:
         total_pad = max_dim - dim
-        before = total_pad // 2
-        after = total_pad - before
-        pad_width.append((before, after))
-        shift.append(before)
-    return np.pad(arr, pad_width, mode="constant", constant_values=0), shift
+        pad_width.append((0, total_pad))
+    return np.pad(arr, pad_width, mode="constant", constant_values=0)
 
 
 level_0 = np.ones(inner_box_shape, dtype=int)
@@ -55,23 +51,21 @@ def pad_to_cube(arr):
 # TODO: with rectangular cuboid for the inner box, there are some issues with the
 #       multires_grid_factory. The inner box should be a cube for now!
 # For now we hack this by padding the level_0 and level_1 to be cubes
-level_0, shift_0 = pad_to_cube(level_0)
-level_1, shift_1 = pad_to_cube(level_1)
+level_0 = pad_to_cube(level_0)
+level_1 = pad_to_cube(level_1)
 
 # Ensure level_0 is contiguous int32
 level_0 = np.ascontiguousarray(level_0, dtype=np.int32)
 
 # Create the multiresolution grid
 levels = [level_0, level_1]
-shifts = [shift_0, shift_1]
 level_origins = [(sphere_origin[0] - 2 * sphere_radius, ny // 2 - inner_box_shape[1] // 2, nz // 2 - inner_box_shape[2] // 2), (0, 0, 0)]
-new_level_origins = [tuple(max(0, a - b) for a, b in zip(origin, shift)) for origin, shift in zip(level_origins, shifts)]
 
 grid = multires_grid_factory(
     grid_shape,
     velocity_set=velocity_set,
     sparsity_pattern_list=[level_0, level_1],
-    sparsity_pattern_origins=[neon.Index_3d(*new_level_origins[lvl]) for lvl in range(num_levels)],
+    sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
 )
 
 # Define Boundary Indices
@@ -134,7 +128,7 @@ def bc_profile_warp(index: wp.vec3i):
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
 
 
-# configure the simulation relaxation time
+# Configure the simulation relaxation time
 visc = 2.0 * u_max * sphere_radius / Re
 omega = 1.0 / (3.0 * visc + 0.5)
 
diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
new file mode 100644
index 00000000..e72d7ce8
--- /dev/null
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -0,0 +1,228 @@
+import xlb
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import FullwayBounceBackBC, HalfwayBounceBackBC, RegularizedBC, ExtrapolationOutflowBC, DoNothingBC, ZouHeBC
+from xlb.utils import make_cuboid_mesh
+import neon
+import warp as wp
+import numpy as np
+import time
+
+
+def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
+    """
+    Generate a multi-level cuboid mesh around the STL part; the finest voxel size is set by num_finest_voxels_across_part.
+    """
+    import open3d as o3d
+    import os
+
+    # Domain multipliers for each refinement level
+    # First entry should be full domain size
+    domainMultiplier = [
+        [15 // 2, 15 // 2, 7 // 2, 7 // 2, 7 // 2, 7 // 2],  # -x, x, -y, y, -z, z
+        [6 // 2, 8 // 2, 5 // 2, 5 // 2, 5 // 2, 5 // 2],  # -x, x, -y, y, -z, z
+        # [4, 6, 4, 4, 4, 4],
+        # [2, 4, 2, 2, 2, 2],
+        # [1, 2, 1, 1, 1, 1],
+        # [0.4, 1, 0.4, 0.4, 0.4, 0.4],
+        # [0.2, 0.4, 0.2, 0.2, 0.2, 0.2],
+    ]
+
+    # Load the mesh
+    mesh = o3d.io.read_triangle_mesh(stl_filename)
+    if mesh.is_empty():
+        raise ValueError("Loaded mesh is empty or invalid.")
+
+    # Compute original bounds
+    aabb = mesh.get_axis_aligned_bounding_box()
+    min_bound = aabb.get_min_bound()
+    max_bound = aabb.get_max_bound()
+    partSize = max_bound - min_bound
+
+    # smallest voxel size
+    voxel_size = min(partSize) / num_finest_voxels_across_part
+
+    # Infer the finest grid shape from the first entry of the domainMultiplier
+    nx_finest = (domainMultiplier[0][0] + domainMultiplier[0][1]) * num_finest_voxels_across_part
+    ny_finest = (domainMultiplier[0][2] + domainMultiplier[0][3]) * num_finest_voxels_across_part
+    nz_finest = (domainMultiplier[0][4] + domainMultiplier[0][5]) * num_finest_voxels_across_part
+    grid_shape_finest = (nx_finest, ny_finest, nz_finest)
+
+    # Compute translation to put mesh into first octant of that domain
+    shift = np.array(
+        [
+            domainMultiplier[0][0] * partSize[0] - min_bound[0],
+            domainMultiplier[0][2] * partSize[1] - min_bound[1],
+            domainMultiplier[0][4] * partSize[2] - min_bound[2],
+        ],
+        dtype=float,
+    )
+
+    # Apply translation and save out temp stl
+    mesh.translate(shift)
+    mesh.compute_vertex_normals()
+    mesh_vertices = np.asarray(mesh.vertices)
+    o3d.io.write_triangle_mesh("temp.stl", mesh)
+
+    # Mesh base don temp stl
+    level_data = make_cuboid_mesh(voxel_size, domainMultiplier, "temp.stl")
+    os.remove("temp.stl")
+
+    return level_data, mesh_vertices, grid_shape_finest
+
+
+def prepare_sparsity_pattern(level_data):
+    """
+    Prepare the sparsity pattern for the multiresolution grid based on the level data. "level_data" is expected to be formatted as in
+    the output of "make_cuboid_mesh".
+    """
+
+    def pad_to_cube(arr):
+        shape = arr.shape
+        max_dim = max(shape)
+        pad_width = []
+        for dim in shape:
+            total_pad = max_dim - dim
+            pad_width.append((0, total_pad))
+        return np.pad(arr, pad_width, mode="constant", constant_values=0)
+
+    num_levels = len(level_data)
+    sparsity_pattern = []
+    level_origins = []
+    for lvl in range(num_levels):
+        # Get the level mask from the level data
+        level_mask = level_data[lvl][0]
+
+        # Ensure the level mask is contiguous int32
+        level_mask = np.ascontiguousarray(level_mask, dtype=np.int32)
+
+        # Pad level to be cubes (TODO: this is a hack, the inner box should be a cube for now!)
+        level_mask = pad_to_cube(level_mask)
+
+        # Append the padded level mask to the sparsity pattern
+        sparsity_pattern.append(level_mask)
+
+        # Get the origin for this level
+        level_origins.append(level_data[lvl][2])
+
+    return sparsity_pattern, level_origins
+
+
+# -------------------------- Simulation Setup --------------------------
+
+# The following parameters define the resolution of the voxelized grid
+num_finest_voxels_across_part = 10
+
+# Other setup parameters
+Re = 500.0
+compute_backend = ComputeBackend.NEON
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
+u_max = 0.04
+num_steps = 1
+post_process_interval = 100
+
+# Initialize XLB
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+# Generate the cuboid mesh and sphere vertices
+stl_filename = "examples/cfd/stl-files/sphere.stl"
+level_data, sphere, grid_shape_finest = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part)
+
+# Convert level data to the format expected by multires_grid_factory
+sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
+
+# Create the multiresolution grid
+num_levels = len(level_data)
+
+grid = multires_grid_factory(
+    grid_shape_finest,
+    velocity_set=velocity_set,
+    sparsity_pattern_list=sparsity_pattern,
+    sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
+)
+
+# Define Boundary Indices
+coarsest_level = grid.count_levels - 1
+box = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level))
+box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level), remove_edges=True)
+inlet = box_no_edge["left"]
+outlet = box_no_edge["right"]
+walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
+walls = np.unique(np.array(walls), axis=-1).tolist()
+
+
+# Define Boundary Conditions
+def bc_profile():
+    assert compute_backend == ComputeBackend.NEON
+
+    # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
+    nx, ny, nz = grid_shape_finest
+    H_y = float(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
+    H_z = float(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i):
+        # Poiseuille flow profile: parabolic velocity distribution
+        y = wp.float32(index[1])
+        z = wp.float32(index[2])
+
+        # Calculate normalized distance from center
+        y_center = y - (H_y / 2.0)
+        z_center = z - (H_z / 2.0)
+        r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+
+        # Parabolic profile: u = u_max * (1 - r²)
+        return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+
+    return bc_profile_warp
+
+
+# Convert bc indices to a list of list (first entry corresponds to the finest level)
+inlet = [[] for _ in range(num_levels)] + [inlet]
+outlet = [[] for _ in range(num_levels)] + [outlet]
+walls = [[] for _ in range(num_levels)] + [walls]
+
+# Initialize Boundary Conditions
+bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
+# Alternatively, use a prescribed velocity profile
+# bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
+bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
+# bc_outlet = ExtrapolationOutflowBC(indices=outlet)
+bc_outlet = DoNothingBC(indices=outlet)
+bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere)
+boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
+
+# Configure the simulation relaxation time
+visc = u_max * num_finest_voxels_across_part / Re
+omega = 1.0 / (3.0 * visc + 0.5)
+
+# Define a multi-resolution simulation manager
+sim = xlb.helper.MultiresSimulationManager(
+    omega=omega,
+    grid=grid,
+    boundary_conditions=boundary_conditions,
+    collision_type="BGK",
+)
+
+# -------------------------- Simulation Loop --------------------------
+
+wp.synchronize()
+start_time = time.time()
+for step in range(num_steps):
+    sim.step()
+
+    if step % post_process_interval == 0 or step == num_steps - 1:
+        # TODO: Issues in the vtk output for rectangular cuboids (as if a cuboid grid with the largest side is assumed)
+        sim.export_macroscopic("multires_flow_over_sphere_3d_")
+        wp.synchronize()
+        end_time = time.time()
+        elapsed = end_time - start_time
+        print(f"Completed step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")
+        start_time = time.time()

From a58206c63eb875a6e78fc85a6dc7117df2fdfce7 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 3 Jun 2025 21:51:59 -0400
Subject: [PATCH 067/208] stl handling in MRES working reasonably with 2 levels

---
 .../grid_refinement/stl_flow_past_sphere_3d.py   | 16 ++++++++--------
 .../boundary_masker/multires_boundary_masker.py  |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index e72d7ce8..2d1844de 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -21,9 +21,9 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     # Domain multipliers for each refinement level
     # First entry should be full domain size
     domainMultiplier = [
-        [15 // 2, 15 // 2, 7 // 2, 7 // 2, 7 // 2, 7 // 2],  # -x, x, -y, y, -z, z
-        [6 // 2, 8 // 2, 5 // 2, 5 // 2, 5 // 2, 5 // 2],  # -x, x, -y, y, -z, z
-        # [4, 6, 4, 4, 4, 4],
+        # [15, 15, 7, 7, 7, 7],  # -x, x, -y, y, -z, z
+        [6, 8, 5, 5, 5, 5],  # -x, x, -y, y, -z, z
+        [4, 6, 4, 4, 4, 4],
         # [2, 4, 2, 2, 2, 2],
         # [1, 2, 1, 1, 1, 1],
         # [0.4, 1, 0.4, 0.4, 0.4, 0.4],
@@ -63,7 +63,7 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     # Apply translation and save out temp stl
     mesh.translate(shift)
     mesh.compute_vertex_normals()
-    mesh_vertices = np.asarray(mesh.vertices)
+    mesh_vertices = np.asarray(mesh.vertices) / voxel_size
     o3d.io.write_triangle_mesh("temp.stl", mesh)
 
     # Mesh base don temp stl
@@ -121,7 +121,7 @@ def pad_to_cube(arr):
 precision_policy = PrecisionPolicy.FP32FP32
 velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
 u_max = 0.04
-num_steps = 1
+num_steps = 1000
 post_process_interval = 100
 
 # Initialize XLB
@@ -185,9 +185,9 @@ def bc_profile_warp(index: wp.vec3i):
 
 
 # Convert bc indices to a list of list (first entry corresponds to the finest level)
-inlet = [[] for _ in range(num_levels)] + [inlet]
-outlet = [[] for _ in range(num_levels)] + [outlet]
-walls = [[] for _ in range(num_levels)] + [walls]
+inlet = [[] for _ in range(num_levels - 1)] + [inlet]
+outlet = [[] for _ in range(num_levels - 1)] + [outlet]
+walls = [[] for _ in range(num_levels - 1)] + [walls]
 
 # Initialize Boundary Conditions
 bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index 0728607d..0de211ab 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -63,7 +63,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
                     bc_with_indices.append(bc_copy)
                 elif bc.mesh_vertices is not None:
                     bc_copy = copy.copy(bc)  # shallow copy of the whole object
-                    bc_copy.mesh_vertices = copy.deepcopy(bc.mesh_vertices)
+                    bc_copy.mesh_vertices = copy.deepcopy(bc.mesh_vertices) / refinement
 
                     # call mesh masker for this bc at this level
                     bc_mask_warp, missing_mask_warp = self.mesh_masker(bc_copy, bc_mask_warp, missing_mask_warp)

From 991fab17039a388803a4f34bd6b22b176a1f52fd Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 9 Jun 2025 09:33:18 -0400
Subject: [PATCH 068/208] parameterized problem 2

---
 examples/performance/mlups_3d_multires.py | 36 +++++++++++++++--------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 808b13d1..ab059032 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -142,36 +142,46 @@ def get_levels(num_levels):
     walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
     # convert bc indices to a list of list, where the first entry of the list corresponds to the finest level
-    lid = [lid, [], [], []]
-    walls = [walls, [], [], []]
+    lid = [lid] + [[] for _ in range(num_levels - 1)]
+    walls = [walls] + [[] for _ in range(num_levels - 1)]
     return grid, lid, walls
 
 
 def problem2(grid_shape, velocity_set):
     # Example 2: Coarsest at the edges (2 level only)
-    num_levels = 2
-    level_1 = np.ones((grid_shape[0] // 2, grid_shape[1] // 2, grid_shape[2] // 2), dtype=int)
-    finestLevel = np.ones((40, 40, 40), dtype=int)
-    finestLevel = np.ascontiguousarray(finestLevel, dtype=np.int32)
-    levels = [finestLevel, level_1]
-    level_origins = [(44, 44, 44), (0, 0, 0)]
+    num_levels = 4
+    level_origins = []
+    level_list = []
+    for lvl in range(num_levels):
+        divider = 2**lvl
+        growth = 1.5**lvl
+        shape = grid_shape[0] // divider, grid_shape[1] // divider, grid_shape[2] // divider
+        if lvl == num_levels - 1:
+            level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
+            box_origin = (0, 0, 0)  # The coarsest level has no origin offset
+        else:
+            box_size = tuple([int(shape[i] // 4 * growth) for i in range(3)])
+            box_origin = tuple([shape[i] // 2 - box_size[i] // 2 for i in range(3)])
+            level = np.ascontiguousarray(np.ones(box_size, dtype=int), dtype=np.int32)
+        level_list.append(level)
+        level_origins.append(neon.Index_3d(*box_origin))
 
     # Create the multires grid
     grid = multires_grid_factory(
         grid_shape,
         velocity_set=velocity_set,
-        sparsity_pattern_list=levels,
-        sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
+        sparsity_pattern_list=level_list,
+        sparsity_pattern_origins=level_origins,
     )
 
-    box = grid.bounding_box_indices(shape=grid.level_to_shape(1))
+    box = grid.bounding_box_indices(shape=grid.level_to_shape(num_levels - 1))
     box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(1), remove_edges=True)
     lid = box_no_edge["top"]
     walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
     # convert bc indices to a list of list, where the first entry of the list corresponds to the finest level
-    lid = [[], lid]
-    walls = [[], walls]
+    lid = [[] for _ in range(num_levels - 1)] + [lid]
+    walls = [[] for _ in range(num_levels - 1)] + [walls]
     return grid, lid, walls
 
 

From 62801fdf7abbdd0a33532f312eb50abca359b007 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 9 Jun 2025 21:28:04 -0400
Subject: [PATCH 069/208] No need for padding after Neon fix. Updated multi-res
 examples.

---
 .../grid_refinement/flow_past_sphere_3d.py    |  64 ++++-----
 .../stl_flow_past_sphere_3d.py                | 122 +++++++-----------
 2 files changed, 70 insertions(+), 116 deletions(-)

diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index dba86d3d..57adac64 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -29,43 +29,29 @@
 # Create the multires grid
 nx, ny, nz = grid_shape
 sphere_origin = (nx // 6, ny // 2, nz // 2)
-sphere_radius = ny // 12
-inner_box_shape = (12 * sphere_radius, 6 * sphere_radius, 6 * sphere_radius)
-num_levels = 2
-
-
-def pad_to_cube(arr):
-    shape = arr.shape
-    max_dim = max(shape)
-    pad_width = []
-    for dim in shape:
-        total_pad = max_dim - dim
-        pad_width.append((0, total_pad))
-    return np.pad(arr, pad_width, mode="constant", constant_values=0)
-
-
-level_0 = np.ones(inner_box_shape, dtype=int)
-level_1 = np.ones((nx // 2, ny // 2, nz // 2), dtype=int)
-
-# Pad both levels to cubes
-# TODO: with rectangular cuboid for the inner box, there are some issues with the
-#       multires_grid_factory. The inner box should be a cube for now!
-# For now we hack this by padding the level_0 and level_1 to be cubes
-level_0 = pad_to_cube(level_0)
-level_1 = pad_to_cube(level_1)
-
-# Ensure level_0 is contiguous int32
-level_0 = np.ascontiguousarray(level_0, dtype=np.int32)
-
-# Create the multiresolution grid
-levels = [level_0, level_1]
-level_origins = [(sphere_origin[0] - 2 * sphere_radius, ny // 2 - inner_box_shape[1] // 2, nz // 2 - inner_box_shape[2] // 2), (0, 0, 0)]
+sphere_radius = min(nx, ny, nz) // 12  # Radius of the sphere
+num_levels = 3
+level_origins = []
+level_list = []
+for lvl in range(num_levels):
+    divider = 2**lvl
+    growth = 1.5**lvl
+    shape = grid_shape[0] // divider, grid_shape[1] // divider, grid_shape[2] // divider
+    if lvl == num_levels - 1:
+        level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
+        box_origin = (0, 0, 0)  # The coarsest level has no origin offset
+    else:
+        box_size = tuple([int(shape[i] // 4 * growth) for i in range(3)])
+        box_origin = tuple([sphere_origin[0] // divider - 4 * sphere_radius // divider] + [shape[i] // 2 - box_size[i] // 2 for i in range(1, 3)])
+        level = np.ascontiguousarray(np.ones(box_size, dtype=int), dtype=np.int32)
+    level_list.append(level)
+    level_origins.append(neon.Index_3d(*box_origin))
 
 grid = multires_grid_factory(
     grid_shape,
     velocity_set=velocity_set,
-    sparsity_pattern_list=[level_0, level_1],
-    sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
+    sparsity_pattern_list=level_list,
+    sparsity_pattern_origins=level_origins,
 )
 
 # Define Boundary Indices
@@ -86,10 +72,10 @@ def pad_to_cube(arr):
 sphere = [tuple(indices[i]) for i in range(velocity_set.d)]
 
 # Convert bc indices to a list of list (first entry corresponds to the finest level)
-inlet = [[], inlet]
-outlet = [[], outlet]
-walls = [[], walls]
-sphere = [sphere, []]
+inlet = [[] for _ in range(num_levels - 1)] + [inlet]
+outlet = [[] for _ in range(num_levels - 1)] + [outlet]
+walls = [[] for _ in range(num_levels - 1)] + [walls]
+sphere = [sphere] + [[] for _ in range(num_levels - 1)]
 
 
 # Define Boundary Conditions
@@ -97,8 +83,8 @@ def bc_profile():
     assert compute_backend == ComputeBackend.NEON
 
     # Note nx, ny, nz are the dimensions of the grid at the finest level
-    H_y = float(ny // 2 - 1)  # Height in y direction
-    H_z = float(nz // 2 - 1)  # Height in z direction
+    H_y = float(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
+    H_z = float(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
 
     @wp.func
     def bc_profile_warp(index: wp.vec3i):
diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index 2d1844de..f71bbc07 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -11,24 +11,16 @@
 import time
 
 
-def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
+def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape):
     """
     Generate a cuboid mesh based on the provided voxel size and domain multipliers.
     """
     import open3d as o3d
     import os
 
-    # Domain multipliers for each refinement level
-    # First entry should be full domain size
-    domainMultiplier = [
-        # [15, 15, 7, 7, 7, 7],  # -x, x, -y, y, -z, z
-        [6, 8, 5, 5, 5, 5],  # -x, x, -y, y, -z, z
-        [4, 6, 4, 4, 4, 4],
-        # [2, 4, 2, 2, 2, 2],
-        # [1, 2, 1, 1, 1, 1],
-        # [0.4, 1, 0.4, 0.4, 0.4, 0.4],
-        # [0.2, 0.4, 0.2, 0.2, 0.2, 0.2],
-    ]
+    # STL position
+    nx, ny, nz = grid_shape
+    sphere_origin = (nx // 6, ny // 2, nz // 2)
 
     # Load the mesh
     mesh = o3d.io.read_triangle_mesh(stl_filename)
@@ -40,74 +32,51 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     min_bound = aabb.get_min_bound()
     max_bound = aabb.get_max_bound()
     partSize = max_bound - min_bound
+    sphere_diameter_phys_units = float(min(partSize))
 
     # smallest voxel size
-    voxel_size = min(partSize) / num_finest_voxels_across_part
-
-    # Infer the finest grid shape from the first entry of the domainMultiplier
-    nx_finest = (domainMultiplier[0][0] + domainMultiplier[0][1]) * num_finest_voxels_across_part
-    ny_finest = (domainMultiplier[0][2] + domainMultiplier[0][3]) * num_finest_voxels_across_part
-    nz_finest = (domainMultiplier[0][4] + domainMultiplier[0][5]) * num_finest_voxels_across_part
-    grid_shape_finest = (nx_finest, ny_finest, nz_finest)
+    voxel_size = sphere_diameter_phys_units / num_finest_voxels_across_part
+    sphere_radius = sphere_diameter_phys_units / voxel_size / 2.0
 
     # Compute translation to put mesh into first octant of that domain—
-    shift = np.array(
-        [
-            domainMultiplier[0][0] * partSize[0] - min_bound[0],
-            domainMultiplier[0][2] * partSize[1] - min_bound[1],
-            domainMultiplier[0][4] * partSize[2] - min_bound[2],
-        ],
-        dtype=float,
-    )
+    shift = np.array(sphere_origin) * voxel_size - sphere_diameter_phys_units / 2.0 - min_bound
 
     # Apply translation and save out temp stl
     mesh.translate(shift)
     mesh.compute_vertex_normals()
     mesh_vertices = np.asarray(mesh.vertices) / voxel_size
     o3d.io.write_triangle_mesh("temp.stl", mesh)
-
-    # Mesh base don temp stl
-    level_data = make_cuboid_mesh(voxel_size, domainMultiplier, "temp.stl")
     os.remove("temp.stl")
 
-    return level_data, mesh_vertices, grid_shape_finest
-
-
-def prepare_sparsity_pattern(level_data):
-    """
-    Prepare the sparsity pattern for the multiresolution grid based on the level data. "level_data" is expected to be formatted as in
-    the output of "make_cuboid_mesh".
-    """
-
-    def pad_to_cube(arr):
-        shape = arr.shape
-        max_dim = max(shape)
-        pad_width = []
-        for dim in shape:
-            total_pad = max_dim - dim
-            pad_width.append((0, total_pad))
-        return np.pad(arr, pad_width, mode="constant", constant_values=0)
-
-    num_levels = len(level_data)
-    sparsity_pattern = []
+    # Mesh based on temp stl
+    # Create the multires grid
+    num_levels = 3
     level_origins = []
+    level_data = []
     for lvl in range(num_levels):
-        # Get the level mask from the level data
-        level_mask = level_data[lvl][0]
-
-        # Ensure level_0 is contiguous int32
-        level_mask = np.ascontiguousarray(level_mask, dtype=np.int32)
-
-        # Pad level to be cubes (TODO: this is a hack, the inner box should be a cube for now!)
-        level_mask = pad_to_cube(level_mask)
-
-        # Append the padded level mask to the sparsity pattern
-        sparsity_pattern.append(level_mask)
-
-        # Get the origin for this level
-        level_origins.append(level_data[lvl][2])
-
-    return sparsity_pattern, level_origins
+        divider = 2**lvl
+        growth = 1.25**lvl
+        shape = nx // divider, ny // divider, nz // divider
+        if lvl == num_levels - 1:
+            level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
+            box_origin = (0, 0, 0)  # The coarsest level has no origin offset
+        else:
+            box_size = tuple([int(shape[i] // 4 * growth) for i in range(3)])
+            if lvl == 0:
+                box_origin = tuple(
+                    [sphere_origin[0] // divider - int(2 * growth * sphere_radius // divider)]
+                    + [shape[i] // 2 - box_size[i] // 2 for i in range(1, 3)]
+                )
+            else:
+                finer_box_size = level_data[-1].shape
+                finer_box_origin = np.array(level_origins[-1])
+                shift = np.array(box_size) - np.array(finer_box_size) // 2
+                box_origin = finer_box_origin // 2 - shift // 2
+            level = np.ascontiguousarray(np.ones(box_size, dtype=int), dtype=np.int32)
+        level_data.append(level)
+        level_origins.append(box_origin)
+
+    return level_data, level_origins, mesh_vertices
 
 
 # -------------------------- Simulation Setup --------------------------
@@ -117,9 +86,10 @@ def pad_to_cube(arr):
 
 # Other setup parameters
 Re = 500.0
+grid_shape = (512 // 2, 128 // 2, 128 // 2)
 compute_backend = ComputeBackend.NEON
 precision_policy = PrecisionPolicy.FP32FP32
-velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
 u_max = 0.04
 num_steps = 1000
 post_process_interval = 100
@@ -133,19 +103,17 @@ def pad_to_cube(arr):
 
 # Generate the cuboid mesh and sphere vertices
 stl_filename = "examples/cfd/stl-files/sphere.stl"
-level_data, sphere, grid_shape_finest = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part)
-
-# Convert level data to the format expected by multires_grid_factory
-sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
+level_data, level_origins, sphere = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape)
 
-# Create the multiresolution grid
+# get the number of levels
 num_levels = len(level_data)
 
+# Create the multires grid
 grid = multires_grid_factory(
-    grid_shape_finest,
+    grid_shape,
     velocity_set=velocity_set,
-    sparsity_pattern_list=sparsity_pattern,
-    sparsity_pattern_origins=[neon.Index_3d(*level_origins[lvl]) for lvl in range(num_levels)],
+    sparsity_pattern_list=level_data,
+    sparsity_pattern_origins=[neon.Index_3d(*box_origin) for box_origin in level_origins],
 )
 
 # Define Boundary Indices
@@ -163,7 +131,7 @@ def bc_profile():
     assert compute_backend == ComputeBackend.NEON
 
     # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
-    nx, ny, nz = grid_shape_finest
+    nx, ny, nz = grid_shape
     H_y = float(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
     H_z = float(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
 
@@ -208,7 +176,7 @@ def bc_profile_warp(index: wp.vec3i):
     omega=omega,
     grid=grid,
     boundary_conditions=boundary_conditions,
-    collision_type="BGK",
+    collision_type="KBC",
 )
 
 # -------------------------- Simulation Loop --------------------------

From 153e16aa70b5f481d31c7d4888c4c2a21e37f6a7 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 27 May 2025 14:08:32 -0400
Subject: [PATCH 070/208] Added new hybrid methods to handle stationary and
 moving boundaries.

---
 examples/cfd/rotating_sphere_3d.py            | 366 +++++++
 examples/cfd/windtunnel_3d.py                 |  35 +-
 xlb/operator/boundary_condition/__init__.py   |   2 +-
 .../boundary_condition/bc_do_nothing.py       |   3 +
 .../boundary_condition/bc_equilibrium.py      |   3 +
 .../bc_extrapolation_outflow.py               |  22 +-
 .../bc_fullway_bounce_back.py                 |   3 +
 .../bc_grads_approximation.py                 | 321 ------
 .../bc_halfway_bounce_back.py                 |  61 +-
 xlb/operator/boundary_condition/bc_hybrid.py  | 296 ++++++
 .../boundary_condition/bc_regularized.py      |   9 +-
 xlb/operator/boundary_condition/bc_zouhe.py   |  18 +-
 .../boundary_condition/boundary_condition.py  |  54 +-
 .../boundary_condition/helper_functions_bc.py | 186 +++-
 xlb/operator/boundary_masker/__init__.py      |   5 +
 xlb/operator/boundary_masker/aabb.py          | 130 +++
 xlb/operator/boundary_masker/aabb_fill.py     | 281 +++++
 .../indices_boundary_masker.py                |  20 +-
 .../boundary_masker/mesh_boundary_masker.py   | 169 +--
 .../mesh_voxelization_method.py               |  10 +
 xlb/operator/boundary_masker/ray.py           | 107 ++
 xlb/operator/boundary_masker/winding.py       | 145 +++
 xlb/operator/force/momentum_transfer.py       |   2 +-
 xlb/operator/macroscopic/first_moment.py      |  34 +-
 xlb/operator/macroscopic/zero_moment.py       |  20 +-
 xlb/operator/stepper/nse_stepper.py           |  87 +-
 xlb/utils/__init__.py                         |  23 +-
 xlb/utils/utils.py                            | 967 ++++++++++++------
 28 files changed, 2552 insertions(+), 827 deletions(-)
 create mode 100644 examples/cfd/rotating_sphere_3d.py
 delete mode 100644 xlb/operator/boundary_condition/bc_grads_approximation.py
 create mode 100644 xlb/operator/boundary_condition/bc_hybrid.py
 create mode 100644 xlb/operator/boundary_masker/aabb.py
 create mode 100644 xlb/operator/boundary_masker/aabb_fill.py
 create mode 100644 xlb/operator/boundary_masker/mesh_voxelization_method.py
 create mode 100644 xlb/operator/boundary_masker/ray.py
 create mode 100644 xlb/operator/boundary_masker/winding.py

diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
new file mode 100644
index 00000000..671feb90
--- /dev/null
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -0,0 +1,366 @@
+import xlb
+import trimesh
+import time
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import grid_factory
+from xlb.operator.stepper import IncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import (
+    HalfwayBounceBackBC,
+    FullwayBounceBackBC,
+    RegularizedBC,
+    DoNothingBC,
+    HybridBC,
+)
+from xlb.operator.force.momentum_transfer import MomentumTransfer
+from xlb.operator.macroscopic import Macroscopic
+from xlb.utils import save_fields_vtk, save_image
+import warp as wp
+import numpy as np
+import jax.numpy as jnp
+import matplotlib.pyplot as plt
+from xlb.operator.equilibrium import QuadraticEquilibrium
+from xlb.operator import Operator
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+
+
+# -------------------------- Simulation Setup --------------------------
+
+# Grid parameters
+wp.clear_kernel_cache()
+diam = 32
+grid_size_x, grid_size_y, grid_size_z = 10 * diam, 7 * diam, 7 * diam
+grid_shape = (grid_size_x, grid_size_y, grid_size_z)
+
+# Simulation Configuration
+compute_backend = ComputeBackend.WARP
+precision_policy = PrecisionPolicy.FP32FP32
+
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+wind_speed = 0.04
+num_steps = 100000
+print_interval = 1000
+post_process_interval = 1000
+
+# Physical Parameters
+Re = 200.0
+visc = wind_speed * diam / Re
+omega = 1.0 / (3.0 * visc + 0.5)
+
+# Rotational speed parameters (see [1] which discusses the problem in terms of 2 non-dimensional parameters: Re and Omega)
+# [1] J. Fluid Mech. (2016), vol. 807, pp. 62–86. © Cambridge University Press 2016 doi:10.1017/jfm.2016.596
+# \Omega = \omega * D / (2 U_\infty) where Omega is non-dimensional and omega is dimensional.
+rot_rate_nondim = -0.2
+rot_rate = 2.0 * wind_speed * rot_rate_nondim / diam
+
+# Print simulation info
+print("\n" + "=" * 50 + "\n")
+print("Simulation Configuration:")
+print(f"Grid size: {grid_size_x} x {grid_size_y} x {grid_size_z}")
+print(f"Backend: {compute_backend}")
+print(f"Velocity set: {velocity_set}")
+print(f"Precision policy: {precision_policy}")
+print(f"Prescribed velocity: {wind_speed}")
+print(f"Reynolds number: {Re}")
+print(f"Max iterations: {num_steps}")
+print("\n" + "=" * 50 + "\n")
+
+# Initialize XLB
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+# Create Grid
+grid = grid_factory(grid_shape, compute_backend=compute_backend)
+
+# Bounding box indices
+box = grid.bounding_box_indices()
+box_no_edge = grid.bounding_box_indices(remove_edges=True)
+inlet = box_no_edge["left"]
+outlet = box["right"]
+walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
+walls = np.unique(np.array(walls), axis=-1).tolist()
+
+# Load the mesh (replace with your own mesh)
+stl_filename = "examples/cfd/stl-files/sphere.stl"
+mesh = trimesh.load_mesh(stl_filename, process=False)
+mesh_vertices = mesh.vertices
+
+# Transform the mesh points to be located in the right position in the wind tunnel
+mesh_vertices -= mesh_vertices.min(axis=0)
+mesh_extents = mesh_vertices.max(axis=0)
+length_phys_unit = mesh_extents.max()
+length_lbm_unit = grid_shape[1] / 7
+dx = length_phys_unit / length_lbm_unit
+mesh_vertices = mesh_vertices / dx
+shift = np.array([grid_shape[0] / 3, (grid_shape[1] - mesh_extents[1] / dx) / 2, (grid_shape[2] - mesh_extents[2] / dx) / 2])
+sphere = mesh_vertices + shift
+diam = np.max(sphere.max(axis=0) - sphere.min(axis=0))
+sphere_cross_section = np.pi * diam**2 / 4.0
+
+
+# Define rotating boundary profile
+def bc_profile():
+    _u_vec = wp.vec(velocity_set.d, dtype=precision_policy.compute_precision.wp_dtype)
+    angular_velocity = _u_vec(0.0, rot_rate, 0.0)
+    origin_np = shift + diam / 2
+    origin_wp = _u_vec(origin_np[0], origin_np[1], origin_np[2])
+
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i, time: Any):
+        x = wp.float32(index[0])
+        y = wp.float32(index[1])
+        z = wp.float32(index[2])
+        surface_coord = _u_vec(x, y, z) - origin_wp
+        return wp.cross(angular_velocity, surface_coord)
+
+    return bc_profile_warp
+
+
+# Define boundary conditions
+bc_left = RegularizedBC("velocity", prescribed_value=(wind_speed, 0.0, 0.0), indices=inlet)
+bc_do_nothing = DoNothingBC(indices=outlet)
+# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method="ray", profile=bc_profile())
+bc_sphere = HybridBC(
+    bc_method="nonequilibrium_regularized", mesh_vertices=sphere, use_mesh_distance=True, voxelization_method="ray", profile=bc_profile()
+)
+# Not assigning a BC for the walls makes them periodic.
+boundary_conditions = [bc_left, bc_do_nothing, bc_sphere]
+
+
+# Setup Stepper
+stepper = IncompressibleNavierStokesStepper(
+    grid=grid,
+    boundary_conditions=boundary_conditions,
+    collision_type="KBC",
+)
+
+
+# Defining an initializer for outlet only
+class OutletInitializer(Operator):
+    def __init__(
+        self,
+        wind_speed=None,
+        grid_shape=None,
+        velocity_set: VelocitySet = None,
+        precision_policy=None,
+        compute_backend=None,
+    ):
+        self.wind_speed = wind_speed
+        self.rho = 1.0
+        self.grid_shape = grid_shape
+        self.equilibrium = QuadraticEquilibrium(velocity_set=velocity_set, precision_policy=precision_policy, compute_backend=compute_backend)
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+    def _construct_warp(self):
+        nx, ny, nz = self.grid_shape
+        _q = self.velocity_set.q
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        _rho = self.compute_dtype(self.rho)
+        _u = _u_vec(self.wind_speed, 0.0, 0.0)
+        _w = self.velocity_set.w
+
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(f: wp.array4d(dtype=Any)):
+            # Get the global index
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            # Set the velocity at the outlet (i.e. where i = nx-1)
+            if index[0] == nx - 1:
+                _feq = self.equilibrium.warp_functional(_rho, _u)
+                for l in range(_q):
+                    f[l, index[0], index[1], index[2]] = _feq[l]
+            else:
+                # In the rest of the domain, we assume zero velocity and equilibrium distribution.
+                for l in range(_q):
+                    f[l, index[0], index[1], index[2]] = _w[l]
+
+        return None, kernel
+
+    @Operator.register_backend(xlb.ComputeBackend.WARP)
+    def warp_implementation(self, f):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[
+                f,
+            ],
+            dim=f.shape[1:],
+        )
+        return f
+
+
+# Make initializer operator
+initializer = OutletInitializer(
+    wind_speed=wind_speed,
+    grid_shape=grid_shape,
+    velocity_set=velocity_set,
+    precision_policy=precision_policy,
+    compute_backend=compute_backend,
+)
+
+# Prepare Fields
+f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields(initializer=initializer)
+
+
+# -------------------------- Helper Functions --------------------------
+
+
+def plot_coefficient(time_steps, coefficients, prefix="drag"):
+    """
+    Plot a force coefficient with various moving averages and save the figure to <prefix>_ma.png.
+
+    Args:
+        time_steps (list): List of time steps.
+        coefficients (list): List of force coefficients.
+    """
+    # Convert lists to numpy arrays for processing
+    time_steps_np = np.array(time_steps)
+    coefficients_np = np.array(coefficients)
+
+    # Define moving average windows
+    windows = [10, 100, 1000, 10000, 100000]
+    labels = ["MA 10", "MA 100", "MA 1,000", "MA 10,000", "MA 100,000"]
+
+    plt.figure(figsize=(12, 8))
+    plt.plot(time_steps_np, coefficients_np, label="Raw", alpha=0.5)
+
+    for window, label in zip(windows, labels):
+        if len(coefficients_np) >= window:
+            ma = np.convolve(coefficients_np, np.ones(window) / window, mode="valid")
+            plt.plot(time_steps_np[window - 1 :], ma, label=label)
+
+    plt.ylim(-1.0, 1.0)
+    plt.legend()
+    plt.xlabel("Time step")
+    plt.ylabel("Drag coefficient")
+    plt.title("Drag Coefficient Over Time with Moving Averages")
+    plt.savefig(prefix + "_ma.png")
+    plt.close()
+
+
+def post_process(
+    step,
+    f_0,
+    f_1,
+    grid_shape,
+    macro,
+    momentum_transfer,
+    missing_mask,
+    bc_mask,
+    wind_speed,
+    car_cross_section,
+    drag_coefficients,
+    lift_coefficients,
+    time_steps,
+):
+    """
+    Post-process simulation data: save fields, compute forces, and plot drag coefficient.
+
+    Args:
+        step (int): Current time step.
+        f_0, f_1: Distribution function buffers (f_0 feeds the macroscopic computation; both go to momentum_transfer).
+        grid_shape (tuple): Shape of the grid.
+        macro: Macroscopic operator object.
+        momentum_transfer: MomentumTransfer operator object.
+        missing_mask: Missing mask from stepper.
+        bc_mask: Boundary condition mask from stepper.
+        wind_speed (float): Prescribed wind speed.
+        car_cross_section (float): Cross-sectional area of the immersed body (the sphere in this example).
+        drag_coefficients (list): List to store drag coefficients.
+        lift_coefficients (list): List to store lift coefficients.
+        time_steps (list): List to store time steps.
+    """
+    wp.synchronize()
+    # Convert to JAX array if necessary
+    if not isinstance(f_0, jnp.ndarray):
+        f_0_jax = wp.to_jax(f_0)
+    else:
+        f_0_jax = f_0
+
+    # Compute macroscopic quantities
+    rho, u = macro(f_0_jax)
+
+    # Remove boundary cells
+    u = u[:, 1:-1, 1:-1, 1:-1]
+    u_magnitude = jnp.sqrt(u[0] ** 2 + u[1] ** 2 + u[2] ** 2)
+
+    fields = {"ux": u[0], "uy": u[1], "uz": u[2], "u_magnitude": u_magnitude}
+
+    # Save fields in VTK format
+    # save_fields_vtk(fields, timestep=step)
+
+    # Save the u_magnitude slice at the mid y-plane
+    mid_y = grid_shape[1] // 2
+    save_image(fields["u_magnitude"][:, mid_y, :], timestep=step)
+
+    # Compute lift and drag
+    boundary_force = momentum_transfer(f_0, f_1, bc_mask, missing_mask)
+    drag = boundary_force[0]  # x-direction
+    lift = boundary_force[2]
+    cd = 2.0 * drag / (wind_speed**2 * car_cross_section)
+    cl = 2.0 * lift / (wind_speed**2 * car_cross_section)
+    print(f"CD={cd}, CL={cl}")
+    drag_coefficients.append(cd)
+    lift_coefficients.append(cl)
+    time_steps.append(step)
+
+    # Plot drag coefficient
+    plot_coefficient(time_steps, drag_coefficients, prefix="drag")
+    plot_coefficient(time_steps, lift_coefficients, prefix="lift")
+
+
+# Setup Momentum Transfer for Force Calculation
+bc_car = boundary_conditions[-1]
+momentum_transfer = MomentumTransfer(bc_car, compute_backend=compute_backend)
+
+# Define Macroscopic Calculation
+macro = Macroscopic(
+    compute_backend=ComputeBackend.JAX,
+    precision_policy=precision_policy,
+    velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
+)
+
+# Initialize Lists to Store Coefficients and Time Steps
+time_steps = []
+drag_coefficients = []
+lift_coefficients = []
+
+# -------------------------- Simulation Loop --------------------------
+
+start_time = time.time()
+for step in range(num_steps):
+    # Perform simulation step
+    f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, step)
+    f_0, f_1 = f_1, f_0  # Swap the buffers
+
+    # Print progress at intervals
+    if step % print_interval == 0:
+        elapsed_time = time.time() - start_time
+        print(f"Iteration: {step}/{num_steps} | Time elapsed: {elapsed_time:.2f}s")
+        start_time = time.time()
+
+    # Post-process at intervals and final step
+    if (step % post_process_interval == 0) or (step == num_steps - 1):
+        post_process(
+            step,
+            f_0,
+            f_1,
+            grid_shape,
+            macro,
+            momentum_transfer,
+            missing_mask,
+            bc_mask,
+            wind_speed,
+            sphere_cross_section,
+            drag_coefficients,
+            lift_coefficients,
+            time_steps,
+        )
+
+print("Simulation completed successfully.")
diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index 074570b8..c8d69aa4 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -10,6 +10,7 @@
     FullwayBounceBackBC,
     RegularizedBC,
     ExtrapolationOutflowBC,
+    HybridBC,
 )
 from xlb.operator.force.momentum_transfer import MomentumTransfer
 from xlb.operator.macroscopic import Macroscopic
@@ -18,7 +19,7 @@
 import numpy as np
 import jax.numpy as jnp
 import matplotlib.pyplot as plt
-
+from xlb.operator.boundary_masker import MeshVoxelizationMethod
 
 # -------------------------- Simulation Setup --------------------------
 
@@ -74,6 +75,7 @@
 
 # Load the mesh (replace with your own mesh)
 stl_filename = "../stl-files/DrivAer-Notchback.stl"
+voxelization_method = MeshVoxelizationMethod.RAY
 mesh = trimesh.load_mesh(stl_filename, process=False)
 mesh_vertices = mesh.vertices
 
@@ -84,7 +86,15 @@
 length_lbm_unit = grid_shape[0] / 4
 dx = length_phys_unit / length_lbm_unit
 mesh_vertices = mesh_vertices / dx
-shift = np.array([grid_shape[0] / 4, (grid_shape[1] - mesh_extents[1] / dx) / 2, 0.0])
+
+# Depending on the voxelization method, shift_z ensures the bottom ground does not intersect with the voxelized mesh
+# Any smaller shift value would lead to large lift computations due to the initial equilibrium distributions. Bigger
+# values would be fine but leave a gap between surfaces that are supposed to touch.
+if voxelization_method in (MeshVoxelizationMethod.RAY, MeshVoxelizationMethod.WINDING):
+    shift_z = 2
+elif voxelization_method in (MeshVoxelizationMethod.AABB, MeshVoxelizationMethod.AABB_FILL):
+    shift_z = 3
+shift = np.array([grid_shape[0] / 4, (grid_shape[1] - mesh_extents[1] / dx) / 2, shift_z])
 car_vertices = mesh_vertices + shift
 car_cross_section = np.prod(mesh_extents[1:]) / dx**2
 
@@ -92,7 +102,8 @@
 bc_left = RegularizedBC("velocity", prescribed_value=(wind_speed, 0.0, 0.0), indices=inlet)
 bc_walls = FullwayBounceBackBC(indices=walls)
 bc_do_nothing = ExtrapolationOutflowBC(indices=outlet)
-bc_car = HalfwayBounceBackBC(mesh_vertices=car_vertices)
+bc_car = HalfwayBounceBackBC(mesh_vertices=car_vertices, voxelization_method=voxelization_method)
+# bc_car = HybridBC(bc_method="nonequilibrium_regularized",  mesh_vertices=car_vertices, voxelization_method=voxelization_method, use_mesh_distance=True)
 boundary_conditions = [bc_walls, bc_left, bc_do_nothing, bc_car]
 
 
@@ -110,28 +121,28 @@
 # -------------------------- Helper Functions --------------------------
 
 
-def plot_drag_coefficient(time_steps, drag_coefficients):
+def plot_coefficient(time_steps, coefficients, prefix="drag"):
     """
     Plot the drag coefficient with various moving averages.
 
     Args:
         time_steps (list): List of time steps.
-        drag_coefficients (list): List of drag coefficients.
+        coefficients (list): List of force coefficients.
     """
     # Convert lists to numpy arrays for processing
     time_steps_np = np.array(time_steps)
-    drag_coefficients_np = np.array(drag_coefficients)
+    coefficients_np = np.array(coefficients)
 
     # Define moving average windows
     windows = [10, 100, 1000, 10000, 100000]
     labels = ["MA 10", "MA 100", "MA 1,000", "MA 10,000", "MA 100,000"]
 
     plt.figure(figsize=(12, 8))
-    plt.plot(time_steps_np, drag_coefficients_np, label="Raw", alpha=0.5)
+    plt.plot(time_steps_np, coefficients_np, label="Raw", alpha=0.5)
 
     for window, label in zip(windows, labels):
-        if len(drag_coefficients_np) >= window:
-            ma = np.convolve(drag_coefficients_np, np.ones(window) / window, mode="valid")
+        if len(coefficients_np) >= window:
+            ma = np.convolve(coefficients_np, np.ones(window) / window, mode="valid")
             plt.plot(time_steps_np[window - 1 :], ma, label=label)
 
     plt.ylim(-1.0, 1.0)
@@ -139,7 +150,7 @@ def plot_drag_coefficient(time_steps, drag_coefficients):
     plt.xlabel("Time step")
     plt.ylabel("Drag coefficient")
     plt.title("Drag Coefficient Over Time with Moving Averages")
-    plt.savefig("drag_coefficient_ma.png")
+    plt.savefig(prefix + "_ma.png")
     plt.close()
 
 
@@ -203,12 +214,14 @@ def post_process(
     lift = boundary_force[2]
     cd = 2.0 * drag / (wind_speed**2 * car_cross_section)
     cl = 2.0 * lift / (wind_speed**2 * car_cross_section)
+    print(f"CD={cd}, CL={cl}")
     drag_coefficients.append(cd)
     lift_coefficients.append(cl)
     time_steps.append(step)
 
     # Plot drag coefficient
-    plot_drag_coefficient(time_steps, drag_coefficients)
+    plot_coefficient(time_steps, drag_coefficients, prefix="drag")
+    plot_coefficient(time_steps, lift_coefficients, prefix="lift")
 
 
 # Setup Momentum Transfer for Force Calculation
diff --git a/xlb/operator/boundary_condition/__init__.py b/xlb/operator/boundary_condition/__init__.py
index 7c87f58c..e1889563 100644
--- a/xlb/operator/boundary_condition/__init__.py
+++ b/xlb/operator/boundary_condition/__init__.py
@@ -8,4 +8,4 @@
 from xlb.operator.boundary_condition.bc_zouhe import ZouHeBC
 from xlb.operator.boundary_condition.bc_regularized import RegularizedBC
 from xlb.operator.boundary_condition.bc_extrapolation_outflow import ExtrapolationOutflowBC
-from xlb.operator.boundary_condition.bc_grads_approximation import GradsApproximationBC
+from xlb.operator.boundary_condition.bc_hybrid import HybridBC
diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index aeefd788..87067a1c 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -16,6 +16,7 @@
     ImplementationStep,
     BoundaryCondition,
 )
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 class DoNothingBC(BoundaryCondition):
@@ -31,6 +32,7 @@ def __init__(
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
     ):
         super().__init__(
             ImplementationStep.STREAMING,
@@ -39,6 +41,7 @@ def __init__(
             compute_backend,
             indices,
             mesh_vertices,
+            voxelization_method,
         )
 
     @Operator.register_backend(ComputeBackend.JAX)
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index 85cfd653..8973046c 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -19,6 +19,7 @@
     ImplementationStep,
     BoundaryCondition,
 )
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 class EquilibriumBC(BoundaryCondition):
@@ -36,6 +37,7 @@ def __init__(
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
     ):
         # Store the equilibrium information
         self.rho = rho
@@ -53,6 +55,7 @@ def __init__(
             compute_backend,
             indices,
             mesh_vertices,
+            voxelization_method,
         )
 
     @Operator.register_backend(ComputeBackend.JAX)
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 884e691e..0f5b6128 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -19,6 +19,7 @@
     ImplementationStep,
     BoundaryCondition,
 )
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 class ExtrapolationOutflowBC(BoundaryCondition):
@@ -42,6 +43,7 @@ def __init__(
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
     ):
         # Call the parent constructor
         super().__init__(
@@ -51,6 +53,7 @@ def __init__(
             compute_backend,
             indices,
             mesh_vertices,
+            voxelization_method,
         )
 
         # find and store the normal vector using indices
@@ -58,7 +61,7 @@ def __init__(
 
         # Unpack the two warp functionals needed for this BC!
         if self.compute_backend == ComputeBackend.WARP:
-            self.warp_functional, self.update_bc_auxilary_data = self.warp_functional
+            self.warp_functional, self.assemble_dynamic_data = self.warp_functional
 
     def _get_normal_vec(self, indices):
         # Get the frequency count and most common element directly
@@ -89,9 +92,10 @@ def _roll(self, fld, vec):
             return jnp.roll(fld, (vec[0], vec[1], vec[2]), axis=(1, 2, 3))
 
     @partial(jit, static_argnums=(0,), inline=True)
-    def update_bc_auxilary_data(self, f_pre, f_post, bc_mask, missing_mask):
+    def assemble_dynamic_data(self, f_pre, f_post, bc_mask, missing_mask):
         """
-        Update the auxilary distribution functions for the boundary condition.
+        Prepare time-dependent dynamic data for imposing the boundary condition in the next iteration after streaming.
+        We use directions that leave the domain for storing this prepared data.
         Since this function is called post-collisiotn: f_pre = f_post_stream and f_post = f_post_collision
         """
         sound_speed = 1.0 / jnp.sqrt(3.0)
@@ -102,7 +106,7 @@ def update_bc_auxilary_data(self, f_pre, f_post, bc_mask, missing_mask):
         # Roll boundary mask in the opposite of the normal vector to mask its next immediate neighbour
         neighbour = self._roll(boundary, -self.normal)
 
-        # gather post-streaming values associated with previous time-step to construct the auxilary data for BC
+        # gather post-streaming values associated with previous time-step to construct the required data for BC
         fpop = jnp.where(boundary, f_pre, f_post)
         fpop_neighbour = jnp.where(neighbour, f_pre, f_post)
 
@@ -168,7 +172,7 @@ def functional(
             return _f
 
         @wp.func
-        def update_bc_auxilary_data(
+        def assemble_dynamic_data(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -177,9 +181,9 @@ def update_bc_auxilary_data(
             _f_pre: Any,
             _f_post: Any,
         ):
-            # Update the auxilary data for this BC using the neighbour's populations stored in f_aux and
-            # f_pre (post-streaming values of the current voxel). We use directions that leave the domain
-            # for storing this prepared data.
+            # Prepare time-dependent dynamic data for imposing the boundary condition in the next iteration after streaming.
+            # We use directions that leave the domain for storing this prepared data.
+            # Since this function is called post-collision: f_pre = f_post_stream and f_post = f_post_collision
             _f = _f_post
             nv = get_normal_vectors(missing_mask)
             for l in range(self.velocity_set.q):
@@ -196,7 +200,7 @@ def update_bc_auxilary_data(
 
         kernel = self._construct_kernel(functional)
 
-        return (functional, update_bc_auxilary_data), kernel
+        return (functional, assemble_dynamic_data), kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, _f_pre, _f_post, bc_mask, missing_mask):
diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index 995e2ff9..7a128035 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -17,6 +17,7 @@
     BoundaryCondition,
     ImplementationStep,
 )
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 class FullwayBounceBackBC(BoundaryCondition):
@@ -31,6 +32,7 @@ def __init__(
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
     ):
         super().__init__(
             ImplementationStep.COLLISION,
@@ -39,6 +41,7 @@ def __init__(
             compute_backend,
             indices,
             mesh_vertices,
+            voxelization_method,
         )
 
     @Operator.register_backend(ComputeBackend.JAX)
diff --git a/xlb/operator/boundary_condition/bc_grads_approximation.py b/xlb/operator/boundary_condition/bc_grads_approximation.py
deleted file mode 100644
index 22fbb4ec..00000000
--- a/xlb/operator/boundary_condition/bc_grads_approximation.py
+++ /dev/null
@@ -1,321 +0,0 @@
-"""
-Base class for boundary conditions in a LBM simulation.
-"""
-
-import jax.numpy as jnp
-from jax import jit
-import jax.lax as lax
-from functools import partial
-import warp as wp
-from typing import Any
-from collections import Counter
-import numpy as np
-
-from xlb.velocity_set.velocity_set import VelocitySet
-from xlb.precision_policy import PrecisionPolicy
-from xlb.compute_backend import ComputeBackend
-from xlb.operator.operator import Operator
-from xlb.operator.macroscopic import Macroscopic
-from xlb.operator.macroscopic.zero_moment import ZeroMoment
-from xlb.operator.macroscopic.second_moment import SecondMoment as MomentumFlux
-from xlb.operator.equilibrium import QuadraticEquilibrium
-from xlb.operator.boundary_condition.boundary_condition import (
-    ImplementationStep,
-    BoundaryCondition,
-)
-
-
-class GradsApproximationBC(BoundaryCondition):
-    """
-    Purpose: Using Grad's approximation to represent fpop based on macroscopic inputs used for outflow [1] and
-    Dirichlet BCs [2]
-    [1] S. Chikatamarla, S. Ansumali, and I. Karlin, "Grad's approximation for missing data in lattice Boltzmann
-        simulations", Europhys. Lett. 74, 215 (2006).
-    [2] Dorschner, B., Chikatamarla, S. S., Bösch, F., & Karlin, I. V. (2015). Grad's approximation for moving and
-        stationary walls in entropic lattice Boltzmann simulations. Journal of Computational Physics, 295, 340-354.
-
-    """
-
-    def __init__(
-        self,
-        velocity_set: VelocitySet = None,
-        precision_policy: PrecisionPolicy = None,
-        compute_backend: ComputeBackend = None,
-        indices=None,
-        mesh_vertices=None,
-    ):
-        # TODO: the input velocity must be suitably stored elesewhere when mesh is moving.
-        self.u = (0, 0, 0)
-
-        # Call the parent constructor
-        super().__init__(
-            ImplementationStep.STREAMING,
-            velocity_set,
-            precision_policy,
-            compute_backend,
-            indices,
-            mesh_vertices,
-        )
-
-        # Instantiate the operator for computing macroscopic values
-        self.macroscopic = Macroscopic()
-        self.zero_moment = ZeroMoment()
-        self.equilibrium = QuadraticEquilibrium()
-        self.momentum_flux = MomentumFlux()
-
-        # This BC needs implicit distance to the mesh
-        self.needs_mesh_distance = True
-
-        # If this BC is defined using indices, it would need padding in order to find missing directions
-        # when imposed on a geometry that is in the domain interior
-        if self.mesh_vertices is None:
-            assert self.indices is not None
-            self.needs_padding = True
-
-        # Raise error if used for 2d examples:
-        if self.velocity_set.d == 2:
-            raise NotImplementedError("This BC is not implemented in 2D!")
-
-        # if indices is not None:
-        #     # this BC would be limited to stationary boundaries
-        #     # assert mesh_vertices is None
-        # if mesh_vertices is not None:
-        #     # this BC would be applicable for stationary and moving boundaries
-        #     assert indices is None
-        #     if mesh_velocity_function is not None:
-        #         # mesh is moving and/or deforming
-
-        assert self.compute_backend == ComputeBackend.WARP, "This BC is currently only implemented with the Warp backend!"
-
-    @Operator.register_backend(ComputeBackend.JAX)
-    @partial(jit, static_argnums=(0))
-    def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # TODO
-        raise NotImplementedError(f"Operation {self.__class__.__name} not implemented in JAX!")
-        return
-
-    def _construct_warp(self):
-        # Set local variables and constants
-        _c = self.velocity_set.c
-        _q = self.velocity_set.q
-        _d = self.velocity_set.d
-        _w = self.velocity_set.w
-        _qi = self.velocity_set.qi
-        _opp_indices = self.velocity_set.opp_indices
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        _u_wall = _u_vec(self.u[0], self.u[1], self.u[2]) if _d == 3 else _u_vec(self.u[0], self.u[1])
-        # diagonal = wp.vec3i(0, 3, 5) if _d == 3 else wp.vec2i(0, 2)
-
-        @wp.func
-        def regularize_fpop(
-            missing_mask: Any,
-            rho: Any,
-            u: Any,
-            fpop: Any,
-        ):
-            """
-            Regularizes the distribution functions by adding non-equilibrium contributions based on second moments of fpop.
-            """
-            # Compute momentum flux of off-equilibrium populations for regularization: Pi^1 = Pi^{neq}
-            feq = self.equilibrium.warp_functional(rho, u)
-            f_neq = fpop - feq
-            PiNeq = self.momentum_flux.warp_functional(f_neq)
-
-            # Compute double dot product Qi:Pi1 (where Pi1 = PiNeq)
-            nt = _d * (_d + 1) // 2
-            for l in range(_q):
-                QiPi1 = self.compute_dtype(0.0)
-                for t in range(nt):
-                    QiPi1 += _qi[l, t] * PiNeq[t]
-
-                # assign all populations based on eq 45 of Latt et al (2008)
-                # fneq ~ f^1
-                fpop1 = self.compute_dtype(4.5) * _w[l] * QiPi1
-                fpop[l] = feq[l] + fpop1
-            return fpop
-
-        @wp.func
-        def grads_approximate_fpop(
-            missing_mask: Any,
-            rho: Any,
-            u: Any,
-            f_post: Any,
-        ):
-            # Purpose: Using Grad's approximation to represent fpop based on macroscopic inputs used for outflow [1] and
-            # Dirichlet BCs [2]
-            # [1] S. Chikatax`marla, S. Ansumali, and I. Karlin, "Grad's approximation for missing data in lattice Boltzmann
-            #   simulations", Europhys. Lett. 74, 215 (2006).
-            # [2] Dorschner, B., Chikatamarla, S. S., Bösch, F., & Karlin, I. V. (2015). Grad's approximation for moving and
-            #    stationary walls in entropic lattice Boltzmann simulations. Journal of Computational Physics, 295, 340-354.
-
-            # Note: See also self.regularize_fpop function which is somewhat similar.
-
-            # Compute pressure tensor Pi using all f_post-streaming values
-            Pi = self.momentum_flux.warp_functional(f_post)
-
-            # Compute double dot product Qi:Pi1 (where Pi1 = PiNeq)
-            nt = _d * (_d + 1) // 2
-            for l in range(_q):
-                # if missing_mask[l] == wp.uint8(1):
-                QiPi = self.compute_dtype(0.0)
-                for t in range(nt):
-                    if t == 0 or t == 3 or t == 5:
-                        QiPi += _qi[l, t] * (Pi[t] - rho / self.compute_dtype(3.0))
-                    else:
-                        QiPi += _qi[l, t] * Pi[t]
-
-                # Compute c.u
-                cu = self.compute_dtype(0.0)
-                for d in range(self.velocity_set.d):
-                    if _c[d, l] == 1:
-                        cu += u[d]
-                    elif _c[d, l] == -1:
-                        cu -= u[d]
-                cu *= self.compute_dtype(3.0)
-
-                # change f_post using the Grad's approximation
-                f_post[l] = rho * _w[l] * (self.compute_dtype(1.0) + cu) + _w[l] * self.compute_dtype(4.5) * QiPi
-
-            return f_post
-
-        # Construct the functionals for this BC
-        @wp.func
-        def functional_method1(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # NOTE: this BC has been reformulated to become entirely local and so has differences compared to the original paper.
-            #       Here we use the current time-step populations (f_pre = f_post_collision and f_post = f_post_streaming).
-            one = self.compute_dtype(1.0)
-            for l in range(_q):
-                # If the mask is missing then take the opposite index
-                if missing_mask[l] == wp.uint8(1):
-                    # The implicit distance to the boundary or "weights" have been stored in known directions of f_1
-                    # weight = f_1[_opp_indices[l], index[0], index[1], index[2]]
-                    weight = self.compute_dtype(0.5)
-
-                    # Use differentiable interpolated BB to find f_missing:
-                    f_post[l] = ((one - weight) * f_post[_opp_indices[l]] + weight * (f_pre[l] + f_pre[_opp_indices[l]])) / (one + weight)
-
-                    # # Add contribution due to moving_wall to f_missing as is usual in regular Bouzidi BC
-                    # cu = self.compute_dtype(0.0)
-                    # for d in range(_d):
-                    #     if _c[d, l] == 1:
-                    #         cu += _u_wall[d]
-                    #     elif _c[d, l] == -1:
-                    #         cu -= _u_wall[d]
-                    # cu *= self.compute_dtype(-6.0) * _w[l]
-                    # f_post[l] += cu
-
-            # Compute density, velocity using all f_post-streaming values
-            rho, u = self.macroscopic.warp_functional(f_post)
-
-            # Compute Grad's appriximation using full equation as in Eq (10) of Dorschner et al.
-            f_post = regularize_fpop(missing_mask, rho, u, f_post)
-            # f_post = grads_approximate_fpop(missing_mask, rho, u, f_post)
-            return f_post
-
-        # Construct the functionals for this BC
-        @wp.func
-        def functional_method2(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # NOTE: this BC has been reformulated to become entirely local and so has differences compared to the original paper.
-            #       Here we use the current time-step populations (f_pre = f_post_collision and f_post = f_post_streaming).
-            # NOTE: f_aux should contain populations at "x_f" (see their fig 1) in the missign direction of the BC which amounts
-            #       to post-collision values being pulled from appropriate cells like ExtrapolationBC
-            #
-            # here I need to compute all terms in Eq (10)
-            # Strategy:
-            # 1) "weights" should have been stored somewhere to be used here.
-            # 2) Given "weights", "u_w" (input to the BC) and "u_f" (computed from f_aux), compute "u_target" as per Eq (14)
-            #    NOTE: in the original paper "u_target" is associated with the previous time step not current time.
-            # 3) Given "weights" use differentiable interpolated BB to find f_missing as I had before:
-            # fmissing = ((1. - weights) * f_poststreaming_iknown + weights * (f_postcollision_imissing + f_postcollision_iknown)) / (1.0 + weights)
-            # 4) Add contribution due to u_w to f_missing as is usual in regular Bouzidi BC (ie. -6.0 * self.lattice.w * jnp.dot(self.vel, c)
-            # 5) Compute rho_target = \sum(f_ibb) based on these values
-            # 6) Compute feq using feq = self.equilibrium(rho_target, u_target)
-            # 7) Compute Pi_neq and Pi_eq using all f_post-streaming values as per:
-            #       Pi_neq = self.momentum_flux(fneq) and Pi_eq = self.momentum_flux(feq)
-            # 8) Compute Grad's appriximation using full equation as in Eq (10)
-            #    NOTE: this is very similar to the regularization procedure.
-
-            _f_nbr = _f_vec()
-            u_target = _u_vec(0.0, 0.0, 0.0) if _d == 3 else _u_vec(0.0, 0.0)
-            num_missing = 0
-            one = self.compute_dtype(1.0)
-            for l in range(_q):
-                # If the mask is missing then take the opposite index
-                if missing_mask[l] == wp.uint8(1):
-                    # Find the neighbour and its velocity value
-                    for ll in range(_q):
-                        # f_0 is the post-collision values of the current time-step
-                        # Get index associated with the fluid neighbours
-                        fluid_nbr_index = type(index)()
-                        for d in range(_d):
-                            fluid_nbr_index[d] = index[d] + _c[d, l]
-                        # The following is the post-collision values of the fluid neighbor cell
-                        _f_nbr[ll] = self.compute_dtype(f_0[ll, fluid_nbr_index[0], fluid_nbr_index[1], fluid_nbr_index[2]])
-
-                    # Compute the velocity vector at the fluid neighbouring cells
-                    _, u_f = self.macroscopic.warp_functional(_f_nbr)
-
-                    # Record the number of missing directions
-                    num_missing += 1
-
-                    # The implicit distance to the boundary or "weights" have been stored in known directions of f_1
-                    weight = f_1[_opp_indices[l], index[0], index[1], index[2]]
-
-                    # Given "weights", "u_w" (input to the BC) and "u_f" (computed from f_aux), compute "u_target" as per Eq (14)
-                    for d in range(_d):
-                        u_target[d] += (weight * u_f[d] + _u_wall[d]) / (one + weight)
-
-                    # Use differentiable interpolated BB to find f_missing:
-                    f_post[l] = ((one - weight) * f_post[_opp_indices[l]] + weight * (f_pre[l] + f_pre[_opp_indices[l]])) / (one + weight)
-
-                    # Add contribution due to moving_wall to f_missing as is usual in regular Bouzidi BC
-                    cu = self.compute_dtype(0.0)
-                    for d in range(_d):
-                        if _c[d, l] == 1:
-                            cu += _u_wall[d]
-                        elif _c[d, l] == -1:
-                            cu -= _u_wall[d]
-                    cu *= self.compute_dtype(-6.0) * _w[l]
-                    f_post[l] += cu
-
-            # Compute rho_target = \sum(f_ibb) based on these values
-            rho_target = self.zero_moment.warp_functional(f_post)
-            for d in range(_d):
-                u_target[d] /= num_missing
-
-            # Compute Grad's appriximation using full equation as in Eq (10) of Dorschner et al.
-            f_post = grads_approximate_fpop(missing_mask, rho_target, u_target, f_post)
-            return f_post
-
-        functional = functional_method1
-
-        kernel = self._construct_kernel(functional)
-
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 8ede0c8b..72859c0d 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -7,7 +7,8 @@
 import jax.lax as lax
 from functools import partial
 import warp as wp
-from typing import Any
+from typing import Any, Union, Tuple, Callable
+import numpy as np
 
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
@@ -16,7 +17,9 @@
 from xlb.operator.boundary_condition.boundary_condition import (
     ImplementationStep,
     BoundaryCondition,
+    HelperFunctionsBC,
 )
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 class HalfwayBounceBackBC(BoundaryCondition):
@@ -33,6 +36,9 @@ def __init__(
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
+        profile: Callable = None,
+        prescribed_value: Union[float, Tuple[float, ...], np.ndarray] = None,
     ):
         # Call the parent constructor
         super().__init__(
@@ -42,11 +48,54 @@ def __init__(
             compute_backend,
             indices,
             mesh_vertices,
+            voxelization_method,
         )
 
         # This BC needs padding for finding missing directions when imposed on a geometry that is in the domain interior
         self.needs_padding = True
 
+        # This BC class accepts either a constant prescribed velocity via the keyword "prescribed_value" or
+        # a velocity profile via the keyword "profile", which must be a callable function.
+        self.profile = profile
+
+        # A flag to enable moving wall treatment when either "prescribed_value" or "profile" is provided.
+        self.needs_moving_wall_treatment = False
+
+        if (profile is not None) or (prescribed_value is not None):
+            self.needs_moving_wall_treatment = True
+
+        # Handle no-slip BCs if neither prescribed_value nor profile is provided.
+        if prescribed_value is None and profile is None:
+            print(f"WARNING! Assuming no-slip condition for BC type = {self.__class__.__name__}!")
+            prescribed_value = [0] * self.velocity_set.d
+
+        # Handle prescribed value if provided
+        if prescribed_value is not None:
+            if profile is not None:
+                raise ValueError("Cannot specify both profile and prescribed_value")
+
+            # Convert input to numpy array for validation
+            if isinstance(prescribed_value, (tuple, list)):
+                prescribed_value = np.array(prescribed_value, dtype=np.float64)
+            elif isinstance(prescribed_value, np.ndarray):
+                prescribed_value = prescribed_value.astype(np.float64)
+            elif isinstance(prescribed_value, (int, float)):
+                raise ValueError("Velocity prescribed_value must be a tuple or array")
+
+            # Validate prescribed value
+            if not isinstance(prescribed_value, np.ndarray):
+                raise ValueError("Velocity prescribed_value must be an array-like")
+
+            # create a constant prescribed profile
+            # Note this BC class is only implemented in WARP.
+            prescribed_value = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)(prescribed_value)
+
+            @wp.func
+            def prescribed_profile_warp(index: wp.vec3i, time: Any):
+                return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
+
+            self.profile = prescribed_profile_warp
+
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
     def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
@@ -60,6 +109,9 @@ def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         )
 
     def _construct_warp(self):
+        # load helper functions
+        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
+
         # Set local constants
         _opp_indices = self.velocity_set.opp_indices
 
@@ -74,6 +126,9 @@ def functional(
             f_pre: Any,
             f_post: Any,
         ):
+            # Get wall velocity
+            u_wall = self.profile(index, timestep)
+
             # Post-streaming values are only modified at missing direction
             _f = f_post
             for l in range(self.velocity_set.q):
@@ -82,6 +137,10 @@ def functional(
                     # Get the pre-streaming distribution function in oppisite direction
                     _f[l] = f_pre[_opp_indices[l]]
 
+                    # Add contribution due to moving_wall to f_missing
+                    if wp.static(self.needs_moving_wall_treatment):
+                        _f[l] += bc_helper.moving_wall_fpop_correction(u_wall, l)
+
             return _f
 
         kernel = self._construct_kernel(functional)
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
new file mode 100644
index 00000000..68593f6c
--- /dev/null
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -0,0 +1,296 @@
+from jax import jit
+from functools import partial
+import warp as wp
+from typing import Any, Union, Tuple, Callable
+import numpy as np
+
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+from xlb.operator.macroscopic import Macroscopic, ZeroMoment
+from xlb.operator.macroscopic import SecondMoment as MomentumFlux
+from xlb.operator.equilibrium import QuadraticEquilibrium
+from xlb.operator.boundary_condition.boundary_condition import (
+    ImplementationStep,
+    BoundaryCondition,
+    HelperFunctionsBC,
+)
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
+
+
+class HybridBC(BoundaryCondition):
+    """
+    The hybrid BC methods in this boundary condition have been originally developed by H. Salehipour and are inspired from
+    various previous publications, in particular [1]. The reformulations are aimed to provide local formulations that are
+    computationally efficient and numerically stable at excessively large Reynolds numbers.
+
+    [1] Dorschner, B., Chikatamarla, S. S., Bösch, F., & Karlin, I. V. (2015). Grad's approximation for moving and
+        stationary walls in entropic lattice Boltzmann simulations. Journal of Computational Physics, 295, 340-354.
+    """
+
+    def __init__(
+        self,
+        bc_method,
+        profile: Callable = None,
+        prescribed_value: Union[float, Tuple[float, ...], np.ndarray] = None,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+        indices=None,
+        mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
+        use_mesh_distance=False,
+    ):
+        assert bc_method in [
+            "bounceback_regularized",
+            "bounceback_grads",
+            "nonequilibrium_regularized",
+        ], f"type = {bc_method} not supported! Use 'bounceback_regularized', 'bounceback_grads' or 'nonequilibrium_regularized'."
+        self.bc_method = bc_method
+
+        # Call the parent constructor
+        super().__init__(
+            ImplementationStep.STREAMING,
+            velocity_set,
+            precision_policy,
+            compute_backend,
+            indices,
+            mesh_vertices,
+            voxelization_method,
+        )
+
+        # Instantiate the operator for computing macroscopic values
+        self.macroscopic = Macroscopic()
+        self.zero_moment = ZeroMoment()
+        self.equilibrium = QuadraticEquilibrium()
+
+        # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
+        # velocity profiles given by keyword "profile" which must be a callable function.
+        self.profile = profile
+
+        # A flag to enable moving wall treatment when either "prescribed_value" or "profile" are provided.
+        self.needs_moving_wall_treatment = False
+
+        if (profile is not None) or (prescribed_value is not None):
+            self.needs_moving_wall_treatment = True
+
+        # Handle no-slip BCs if neither prescribed_value or profile are provided.
+        if prescribed_value is None and profile is None:
+            print(f"WARNING! Assuming no-slip condition for BC type = {self.__class__.__name__}_{self.bc_method}!")
+            prescribed_value = [0] * self.velocity_set.d
+
+        # Handle prescribed value if provided
+        if prescribed_value is not None:
+            if profile is not None:
+                raise ValueError("Cannot specify both profile and prescribed_value")
+
+            # Convert input to numpy array for validation
+            if isinstance(prescribed_value, (tuple, list)):
+                prescribed_value = np.array(prescribed_value, dtype=np.float64)
+            elif isinstance(prescribed_value, np.ndarray):
+                prescribed_value = prescribed_value.astype(np.float64)
+            elif isinstance(prescribed_value, (int, float)):
+                raise ValueError("Velocity prescribed_value must be a tuple or array")
+
+            # Validate prescribed value
+            if not isinstance(prescribed_value, np.ndarray):
+                raise ValueError("Velocity prescribed_value must be an array-like")
+
+            # create a constant prescribed profile
+            # Note this BC class is only implemented in WARP.
+            prescribed_value = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)(prescribed_value)
+
+            @wp.func
+            def prescribed_profile_warp(index: wp.vec3i, time: Any):
+                return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
+
+            self.profile = prescribed_profile_warp
+
+        # Set whether this BC needs mesh distance
+        self.needs_mesh_distance = use_mesh_distance
+
+        # This BC needs normalized distance to the mesh
+        if self.needs_mesh_distance:
+            # This BC needs auxiliary data recovery after streaming
+            self.needs_aux_recovery = True
+
+        # If this BC is defined using indices, it would need padding in order to find missing directions
+        # when imposed on a geometry that is in the domain interior
+        if self.mesh_vertices is None:
+            assert self.indices is not None
+            assert self.needs_mesh_distance is False, 'To use mesh distance, please provide the mesh vertices using keyword "mesh_vertices"!'
+            self.needs_padding = True
+
+        # Raise error if used for 2d examples:
+        if self.velocity_set.d == 2:
+            raise NotImplementedError("This BC is not implemented in 2D!")
+
+        # if indices is not None:
+        #     # this BC would be limited to stationary boundaries
+        #     # assert mesh_vertices is None
+        # if mesh_vertices is not None:
+        #     # this BC would be applicable for stationary and moving boundaries
+        #     assert indices is None
+        #     if mesh_velocity_function is not None:
+        #         # mesh is moving and/or deforming
+
+        assert self.compute_backend == ComputeBackend.WARP, "This BC is currently only implemented with the Warp backend!"
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    @partial(jit, static_argnums=(0))
+    def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        """Placeholder for the JAX backend: always raises NotImplementedError since this BC is Warp-only for now."""
+        # TODO: port to JAX. Note `__name__` is required here; `__name` would be name-mangled and raise AttributeError.
+        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in JAX!")
+
+    def _construct_warp(self):
+        # load helper functions
+        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
+
+        # Set local variables and constants
+        _c = self.velocity_set.c
+        _q = self.velocity_set.q
+        _d = self.velocity_set.d
+        _opp_indices = self.velocity_set.opp_indices
+        _f_vec = wp.vec(_q, dtype=self.compute_dtype)
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        _u_wall = _u_vec(0.0, 0.0, 0.0) if _d == 3 else _u_vec(0.0, 0.0)
+
+        # Construct the functionals for this BC
+        @wp.func
+        def hybrid_bounceback_regularized(
+            index: Any,
+            timestep: Any,
+            _missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+        ):
+            # Use the regularization technique of [1] to represent fpop from macroscopic values derived from the interpolated
+            # bounceback scheme of [2], which is applied first to fill in the missing populations at the boundary.
+            # [1] Latt, J., Chopard, B., Malaspinas, O., Deville, M., Michler, A., 2008. Straight velocity
+            #     boundaries in the lattice Boltzmann method. Physical Review E 77, 056703.
+            # [2] Yu, D., Mei, R., Shyy, W., 2003. A unified boundary treatment in lattice Boltzmann method,
+            #     in: 41st aerospace sciences meeting and exhibit, p. 953.
+
+            # Apply interpolated bounceback first to find missing populations at the boundary
+            u_wall = self.profile(index, timestep)
+            f_post = bc_helper.interpolated_bounceback(
+                index,
+                _missing_mask,
+                f_0,
+                f_1,
+                f_pre,
+                f_post,
+                u_wall,
+                wp.static(self.needs_moving_wall_treatment),
+                wp.static(self.needs_mesh_distance),
+            )
+
+            # Compute density, velocity using all f_post-streaming values
+            rho, u = self.macroscopic.warp_functional(f_post)
+
+            # Regularize the resulting populations
+            feq = self.equilibrium.warp_functional(rho, u)
+            f_post = bc_helper.regularize_fpop(f_post, feq)
+            return f_post
+
+        @wp.func
+        def hybrid_bounceback_grads(
+            index: Any,
+            timestep: Any,
+            _missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+        ):
+            # Use Grad's approximation [1] to represent fpop from macroscopic values derived from the interpolated
+            # bounceback scheme of [2], which is applied first to fill in the missing populations at the boundary.
+            # [1] Dorschner, B., Chikatamarla, S. S., Bösch, F., & Karlin, I. V. (2015). Grad's approximation for moving and
+            #    stationary walls in entropic lattice Boltzmann simulations. Journal of Computational Physics, 295, 340-354.
+            # [2] Yu, D., Mei, R., Shyy, W., 2003. A unified boundary treatment in lattice Boltzmann method,
+            #     in: 41st aerospace sciences meeting and exhibit, p. 953.
+
+            # Apply interpolated bounceback first to find missing populations at the boundary
+            u_wall = self.profile(index, timestep)
+            f_post = bc_helper.interpolated_bounceback(
+                index,
+                _missing_mask,
+                f_0,
+                f_1,
+                f_pre,
+                f_post,
+                u_wall,
+                wp.static(self.needs_moving_wall_treatment),
+                wp.static(self.needs_mesh_distance),
+            )
+
+            # Compute density, velocity using all f_post-streaming values
+            rho, u = self.macroscopic.warp_functional(f_post)
+
+            # Compute Grad's approximation using full equation as in Eq (10) of Dorschner et al.
+            f_post = bc_helper.grads_approximate_fpop(_missing_mask, rho, u, f_post)
+            return f_post
+
+        @wp.func
+        def hybrid_nonequilibrium_regularized(
+            index: Any,
+            timestep: Any,
+            _missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+        ):
+            # This boundary condition uses the method of Tao et al (2018) [1] to get unknown populations on curved boundaries (denoted here by
+            # interpolated_nonequilibrium_bounceback method). To further stabilize this BC, we add the regularization technique of [2].
+            # [1] Tao, Shi, et al. "One-point second-order curved boundary condition for lattice Boltzmann simulation of suspended particles."
+            #     Computers & Mathematics with Applications 76.7 (2018): 1593-1607.
+            # [2] Latt, J., Chopard, B., Malaspinas, O., Deville, M., Michler, A., 2008. Straight velocity
+            #     boundaries in the lattice Boltzmann method. Physical Review E 77, 056703.
+
+            # Apply interpolated nonequilibrium bounceback first to find missing populations at the boundary
+            u_wall = self.profile(index, timestep)
+            f_post = bc_helper.interpolated_nonequilibrium_bounceback(
+                index,
+                _missing_mask,
+                f_0,
+                f_1,
+                f_pre,
+                f_post,
+                u_wall,
+                wp.static(self.needs_moving_wall_treatment),
+                wp.static(self.needs_mesh_distance),
+            )
+
+            # Compute density, velocity using all f_post-streaming values
+            rho, u = self.macroscopic.warp_functional(f_post)
+
+            # Regularize the resulting populations
+            feq = self.equilibrium.warp_functional(rho, u)
+            f_post = bc_helper.regularize_fpop(f_post, feq)
+            return f_post
+
+        if self.bc_method == "bounceback_regularized":
+            functional = hybrid_bounceback_regularized
+        elif self.bc_method == "bounceback_grads":
+            functional = hybrid_bounceback_grads
+        elif self.bc_method == "nonequilibrium_regularized":
+            functional = hybrid_nonequilibrium_regularized
+
+        kernel = self._construct_kernel(functional)
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, _missing_mask):
+        """Launch this BC's Warp kernel on the given fields and return ``f_post``."""
+        # The launch grid is f_pre's shape with its leading axis dropped.
+        launch_dim = f_pre.shape[1:]
+        kernel_inputs = [f_pre, f_post, bc_mask, _missing_mask]
+        wp.launch(self.warp_kernel, inputs=kernel_inputs, dim=launch_dim)
+        # f_post is handed back to the caller (presumably updated in place by the kernel -- TODO confirm).
+        return f_post
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 1950fc1b..8f5ef40e 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -7,7 +7,7 @@
 import jax.lax as lax
 from functools import partial
 import warp as wp
-from typing import Any, Union, Tuple
+from typing import Any, Union, Tuple, Callable
 import numpy as np
 
 from xlb.velocity_set.velocity_set import VelocitySet
@@ -16,6 +16,7 @@
 from xlb.operator.operator import Operator
 from xlb.operator.boundary_condition import ZouHeBC, HelperFunctionsBC
 from xlb.operator.macroscopic import SecondMoment as MomentumFlux
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 class RegularizedBC(ZouHeBC):
@@ -43,13 +44,14 @@ class RegularizedBC(ZouHeBC):
     def __init__(
         self,
         bc_type,
-        profile=None,
+        profile: Callable = None,
         prescribed_value: Union[float, Tuple[float, ...], np.ndarray] = None,
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
     ):
         # Call the parent constructor
         super().__init__(
@@ -61,6 +63,7 @@ def __init__(
             compute_backend,
             indices,
             mesh_vertices,
+            voxelization_method,
         )
         self.momentum_flux = MomentumFlux()
 
@@ -150,7 +153,7 @@ def functional_velocity(
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = f_1[0, index[0], index[1], index[2]]
+            prescribed_value = self.compute_dtype(f_1[0, index[0], index[1], index[2]])
             _u = -prescribed_value * normals
 
             # calculate rho
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 5cad5048..992c24a4 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -7,7 +7,7 @@
 import jax.lax as lax
 from functools import partial
 import warp as wp
-from typing import Any, Union, Tuple
+from typing import Any, Union, Tuple, Callable
 import numpy as np
 
 from xlb.velocity_set.velocity_set import VelocitySet
@@ -20,6 +20,7 @@
 )
 from xlb.operator.boundary_condition import HelperFunctionsBC
 from xlb.operator.equilibrium import QuadraticEquilibrium
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 class ZouHeBC(BoundaryCondition):
@@ -37,20 +38,20 @@ class ZouHeBC(BoundaryCondition):
     def __init__(
         self,
         bc_type,
-        profile=None,
+        profile: Callable = None,
         prescribed_value: Union[float, Tuple[float, ...], np.ndarray] = None,
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
     ):
         # Important Note: it is critical to add id inside __init__ for this BC because different instantiations of this BC
         # may have different types (velocity or pressure).
         assert bc_type in ["velocity", "pressure"], f"type = {bc_type} not supported! Use 'pressure' or 'velocity'."
         self.bc_type = bc_type
         self.equilibrium_operator = QuadraticEquilibrium()
-        self.profile = profile
 
         # Call the parent constructor
         super().__init__(
@@ -60,8 +61,13 @@ def __init__(
             compute_backend,
             indices,
             mesh_vertices,
+            voxelization_method,
         )
 
+        # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
+        # velocity profiles given by keyword "profile" which must be a callable function.
+        self.profile = profile
+
         # Handle prescribed value if provided
         if prescribed_value is not None:
             if profile is not None:
@@ -100,13 +106,13 @@ def __init__(
             self.prescribed_value = prescribed_value
             self.profile = self._create_constant_prescribed_profile()
 
-        # This BC needs auxilary data initialization before streaming
+        # This BC needs auxiliary data initialization before streaming
         self.needs_aux_init = True
 
-        # This BC needs auxilary data recovery after streaming
+        # This BC needs auxiliary data recovery after streaming
         self.needs_aux_recovery = True
 
-        # This BC needs one auxilary data for the density or normal velocity
+        # This BC needs one auxiliary data for the density or normal velocity
         self.num_of_aux_data = 1
 
         # This BC needs padding for finding missing directions when imposed on a geometry that is in the domain interior
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 0b8a93c6..d86e989b 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -17,6 +17,7 @@
 from xlb import DefaultConfig
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.boundary_condition import HelperFunctionsBC
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 
 
 # Enum for implementation step
@@ -38,6 +39,7 @@ def __init__(
         compute_backend: ComputeBackend = None,
         indices=None,
         mesh_vertices=None,
+        voxelization_method: MeshVoxelizationMethod = None,
     ):
         self.id = boundary_condition_registry.register_boundary_condition(self.__class__.__name__ + "_" + str(hash(self)))
         velocity_set = velocity_set or DefaultConfig.velocity_set
@@ -57,25 +59,50 @@ def __init__(
         # when inside/outside of the geoemtry is not known
         self.needs_padding = False
 
-        # A flag for BCs that need implicit boundary distance between the grid and a mesh (to be set to True if applicable inside each BC)
+        # A flag for BCs that need normalized distance between the grid and a mesh (to be set to True if applicable inside each BC)
         self.needs_mesh_distance = False
 
-        # A flag for BCs that need auxilary data initialization before stepper
+        # A flag for BCs that need auxiliary data initialization before stepper
         self.needs_aux_init = False
 
-        # A flag to track if the BC is initialized with auxilary data
+        # A flag to track if the BC is initialized with auxiliary data
         self.is_initialized_with_aux_data = False
 
-        # Number of auxilary data needed for the BC (for prescribed values)
+        # Number of auxiliary data needed for the BC (for prescribed values)
         self.num_of_aux_data = 0
 
-        # A flag for BCs that need auxilary data recovery after streaming
+        # A flag for BCs that need auxiliary data recovery after streaming
         self.needs_aux_recovery = False
 
+        # Voxelization method. For BC's specified on a mesh, the user can specify the voxelization scheme.
+        # Currently we support three methods based on (a) aabb method (b) ray casting and (c) winding number.
+        self.voxelization_method = voxelization_method
+
+        if self.compute_backend == ComputeBackend.WARP:
+            # Set local constants TODO: This is a hack and should be fixed with warp update
+            _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+            _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)  # TODO fix vec bool
+
+        @wp.func
+        def assemble_dynamic_data(
+            index: Any,
+            timestep: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+        ):
+            return f_post
+
+        # Construct some helper warp functions for getting tid data
+        if self.compute_backend == ComputeBackend.WARP:
+            self.assemble_dynamic_data = assemble_dynamic_data
+
     @partial(jit, static_argnums=(0,), inline=True)
-    def update_bc_auxilary_data(self, f_pre, f_post, bc_mask, missing_mask):
+    def assemble_dynamic_data(self, f_pre, f_post, bc_mask, missing_mask):
         """
-        A placeholder function for prepare the auxilary distribution functions for the boundary condition.
+        A placeholder function for preparing the auxiliary distribution functions for the boundary condition.
         currently being called after collision only.
         """
         return f_post
@@ -118,7 +145,7 @@ def kernel(
 
     def _construct_aux_data_init_kernel(self, functional):
         """
-        Constructs the warp kernel for the auxilary data recovery.
+        Constructs the warp kernel for the auxiliary data recovery.
         """
         bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
 
@@ -129,7 +156,6 @@ def _construct_aux_data_init_kernel(self, functional):
         # Construct the warp kernel
         @wp.kernel
         def aux_data_init_kernel(
-            f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.bool),
@@ -139,7 +165,7 @@ def aux_data_init_kernel(
             index = wp.vec3i(i, j, k)
 
             # read tid data
-            _f_0, _f_1, _boundary_id, _missing_mask = bc_helper.get_thread_data(f_0, f_1, bc_mask, missing_mask, index)
+            _, _, _boundary_id, _missing_mask = bc_helper.get_thread_data(f_1, f_1, bc_mask, missing_mask, index)
 
             # Apply the functional
             if _boundary_id == _id:
@@ -160,16 +186,16 @@ def aux_data_init_kernel(
 
         return aux_data_init_kernel
 
-    def aux_data_init(self, f_0, f_1, bc_mask, missing_mask):
+    def aux_data_init(self, f_1, bc_mask, missing_mask):
         if self.compute_backend == ComputeBackend.WARP:
             # Launch the warp kernel
             wp.launch(
                 self._construct_aux_data_init_kernel(self.profile),
-                inputs=[f_0, f_1, bc_mask, missing_mask],
-                dim=f_0.shape[1:],
+                inputs=[f_1, bc_mask, missing_mask],
+                dim=f_1.shape[1:],
             )
         elif self.compute_backend == ComputeBackend.JAX:
             # We don't use boundary aux encoding/decoding in JAX
             self.prescribed_values = self.profile()
         self.is_initialized_with_aux_data = True
-        return f_0, f_1
+        return f_1
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 6f8e768b..0782ac96 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -1,5 +1,7 @@
 from xlb import DefaultConfig, ComputeBackend
-from xlb.operator.macroscopic.second_moment import SecondMoment as MomentumFlux
+from xlb.operator.macroscopic import SecondMoment as MomentumFlux
+from xlb.operator.macroscopic import Macroscopic
+from xlb.operator.equilibrium import QuadraticEquilibrium
 import warp as wp
 from typing import Any
 
@@ -30,6 +32,12 @@ def __init__(self, velocity_set=None, precision_policy=None, compute_backend=Non
         _f_vec = wp.vec(_q, dtype=compute_dtype)
         _missing_mask_vec = wp.vec(_q, dtype=wp.uint8)  # TODO fix vec bool
 
+        # Define the operator needed for computing equilibrium
+        equilibrium = QuadraticEquilibrium(velocity_set, precision_policy, compute_backend)
+
+        # Define the operator needed for computing macroscopic variables
+        macroscopic = Macroscopic(velocity_set, precision_policy, compute_backend)
+
         # Define the operator needed for computing the momentum flux
         momentum_flux = MomentumFlux(velocity_set, precision_policy, compute_backend)
 
@@ -61,38 +69,38 @@ def get_thread_data(
         @wp.func
         def get_bc_fsum(
             fpop: Any,
-            missing_mask: Any,
+            _missing_mask: Any,
         ):
             fsum_known = compute_dtype(0.0)
             fsum_middle = compute_dtype(0.0)
             for l in range(_q):
-                if missing_mask[_opp_indices[l]] == wp.uint8(1):
+                if _missing_mask[_opp_indices[l]] == wp.uint8(1):
                     fsum_known += compute_dtype(2.0) * fpop[l]
-                elif missing_mask[l] != wp.uint8(1):
+                elif _missing_mask[l] != wp.uint8(1):
                     fsum_middle += fpop[l]
             return fsum_known + fsum_middle
 
         @wp.func
         def get_normal_vectors(
-            missing_mask: Any,
+            _missing_mask: Any,
         ):
             if wp.static(_d == 3):
                 for l in range(_q):
-                    if missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) + wp.abs(_c[2, l]) == 1:
+                    if _missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) + wp.abs(_c[2, l]) == 1:
                         return -_u_vec(_c_float[0, l], _c_float[1, l], _c_float[2, l])
             else:
                 for l in range(_q):
-                    if missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
+                    if _missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
                         return -_u_vec(_c_float[0, l], _c_float[1, l])
 
         @wp.func
         def bounceback_nonequilibrium(
             fpop: Any,
             feq: Any,
-            missing_mask: Any,
+            _missing_mask: Any,
         ):
             for l in range(_q):
-                if missing_mask[l] == wp.uint8(1):
+                if _missing_mask[l] == wp.uint8(1):
                     fpop[l] = fpop[_opp_indices[l]] + feq[l] - feq[_opp_indices[l]]
             return fpop
 
@@ -121,8 +129,168 @@ def regularize_fpop(
                 fpop[l] = feq[l] + fpop1
             return fpop
 
+        @wp.func
+        def grads_approximate_fpop(
+            _missing_mask: Any,
+            rho: Any,
+            u: Any,
+            f_post: Any,
+        ):
+            # Purpose: Using Grad's approximation to represent fpop based on macroscopic inputs used for outflow [1] and
+            # Dirichlet BCs [2]
+            # [1] S. Chikatamarla, S. Ansumali, and I. Karlin, "Grad's approximation for missing data in lattice Boltzmann
+            #   simulations", Europhys. Lett. 74, 215 (2006).
+            # [2] Dorschner, B., Chikatamarla, S. S., Bösch, F., & Karlin, I. V. (2015). Grad's approximation for moving and
+            #    stationary walls in entropic lattice Boltzmann simulations. Journal of Computational Physics, 295, 340-354.
+
+            # Note: See also self.regularize_fpop function which is somewhat similar.
+
+            # Compute pressure tensor Pi using all f_post-streaming values
+            Pi = momentum_flux.warp_functional(f_post)
+
+            # Compute double dot product Qi:Pi1 (where Pi1 = PiNeq)
+            nt = _d * (_d + 1) // 2
+            for l in range(_q):
+                if _missing_mask[l] == wp.uint8(1):
+                    # compute dot product of qi and Pi
+                    QiPi = compute_dtype(0.0)
+                    for t in range(nt):
+                        if t == 0 or t == 3 or t == 5:
+                            QiPi += _qi[l, t] * (Pi[t] - rho / compute_dtype(3.0))
+                        else:
+                            QiPi += _qi[l, t] * Pi[t]
+
+                    # Compute c.u
+                    cu = compute_dtype(0.0)
+                    for d in range(_d):
+                        if _c[d, l] == 1:
+                            cu += u[d]
+                        elif _c[d, l] == -1:
+                            cu -= u[d]
+                    cu *= compute_dtype(3.0)
+
+                    # change f_post using the Grad's approximation
+                    f_post[l] = rho * _w[l] * (compute_dtype(1.0) + cu) + _w[l] * compute_dtype(4.5) * QiPi
+
+            return f_post
+
+        @wp.func
+        def moving_wall_fpop_correction(
+            u_wall: Any,
+            lattice_direction: Any,
+        ):
+            # Add forcing term necessary to account for the local density changes caused by the mass displacement
+            # as the object moves with velocity u_wall.
+            # [1] L.-S. Luo, Unified theory of lattice Boltzmann models for nonideal gases, Phys. Rev. Lett. 81 (1998) 1618-1621.
+            # [2] L.-S. Luo, Theory of the lattice Boltzmann method: Lattice Boltzmann models for nonideal gases, Phys. Rev. E 62 (2000) 4982-4996.
+            #
+            # Note: this function must be called within a for-loop over all lattice directions and the populations to be modified must
+            # be only those in the missing direction (the check for missing direction must be outside of this function).
+            cu = compute_dtype(0.0)
+            l = lattice_direction
+            for d in range(_d):
+                if _c[d, l] == 1:
+                    cu += u_wall[d]
+                elif _c[d, l] == -1:
+                    cu -= u_wall[d]
+            cu *= compute_dtype(6.0) * _w[l]
+            return cu
+
+        @wp.func
+        def interpolated_bounceback(
+            index: Any,
+            _missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+            u_wall: Any,
+            needs_moving_wall_treatment: bool,
+            needs_mesh_distance: bool,
+        ):
+            # A local single-node version of the interpolated bounce-back boundary condition due to Bouzidi for a lattice
+            # Boltzmann method simulation.
+            # Ref:
+            # [1] Yu, D., Mei, R., Shyy, W., 2003. A unified boundary treatment in lattice Boltzmann method,
+            # in: 41st aerospace sciences meeting and exhibit, p. 953.
+
+            one = compute_dtype(1.0)
+            for l in range(_q):
+                # If the mask is missing then take the opposite index
+                if _missing_mask[l] == wp.uint8(1):
+                    # The normalized distance to the mesh or "weights" have been stored in known directions of f_1
+                    if needs_mesh_distance:
+                        # use weights associated with curved boundaries that are properly stored in f_1.
+                        weight = compute_dtype(f_1[_opp_indices[l], index[0], index[1], index[2]])
+                    else:
+                        weight = compute_dtype(0.5)
+
+                    if _missing_mask[_opp_indices[l]] == wp.uint8(0):
+                        # Use differentiable interpolated BB to find f_missing:
+                        f_post[l] = ((one - weight) * f_post[_opp_indices[l]] + weight * (f_pre[l] + f_pre[_opp_indices[l]])) / (one + weight)
+                    else:
+                        # These are cases where the boundary is sandwiched between 2 solid cells and so both opposite directions are missing.
+                        f_post[l] = f_pre[_opp_indices[l]]
+
+                    # Add contribution due to moving_wall to f_missing as is usual in regular Bouzidi BC
+                    if needs_moving_wall_treatment:
+                        f_post[l] += moving_wall_fpop_correction(u_wall, l)
+            return f_post
+
+        @wp.func
+        def interpolated_nonequilibrium_bounceback(
+            index: Any,
+            _missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            f_pre: Any,
+            f_post: Any,
+            u_wall: Any,
+            needs_moving_wall_treatment: bool,
+            needs_mesh_distance: bool,
+        ):
+            # Compute density and velocity from the f_pre populations, then the matching equilibrium
+            rho, u = macroscopic.warp_functional(f_pre)
+            feq = equilibrium.warp_functional(rho, u)
+
+            # Equilibrium at the wall: evaluated at u_wall for a moving wall, otherwise filled in per direction below
+            if needs_moving_wall_treatment:
+                feq_wall = equilibrium.warp_functional(rho, u_wall)
+            else:
+                feq_wall = _f_vec()
+
+            # Apply the method of Tao et al. (2018) to find missing populations at the boundary (NOTE(review): full citation not given here)
+            one = compute_dtype(1.0)
+            for l in range(_q):
+                # Only populations flagged as missing are reconstructed; all other entries of f_post are left untouched
+                if _missing_mask[l] == wp.uint8(1):
+                    # The normalized distance to the mesh or "weights" have been stored in known directions of f_1
+                    if needs_mesh_distance:
+                        # per-direction weight for curved boundaries, read from the opposite-direction slot of f_1 at this cell
+                        weight = compute_dtype(f_1[_opp_indices[l], index[0], index[1], index[2]])
+                    else:
+                        weight = compute_dtype(0.5)
+
+                    # Non-equilibrium part of the opposite-direction population, used to find f_missing:
+                    fneq = f_pre[_opp_indices[l]] - feq[_opp_indices[l]]
+
+                    # Stationary wall (no-slip): the wall equilibrium is taken at zero velocity,
+                    # which is set here to w[l] * rho
+                    if not needs_moving_wall_treatment:
+                        feq_wall[l] = _w[l] * rho
+
+                    # Assemble wall population for doing interpolation at the boundary
+                    f_wall = feq_wall[l] + fneq
+                    f_post[l] = (f_wall + weight * f_pre[l]) / (one + weight)
+
+            return f_post
+
         self.get_thread_data = get_thread_data
         self.get_bc_fsum = get_bc_fsum
         self.get_normal_vectors = get_normal_vectors
         self.bounceback_nonequilibrium = bounceback_nonequilibrium
         self.regularize_fpop = regularize_fpop
+        self.grads_approximate_fpop = grads_approximate_fpop
+        self.moving_wall_fpop_correction = moving_wall_fpop_correction
+        self.interpolated_bounceback = interpolated_bounceback
+        self.interpolated_nonequilibrium_bounceback = interpolated_nonequilibrium_bounceback
diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index 3417c3c8..c4772eca 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -1,2 +1,7 @@
 from xlb.operator.boundary_masker.indices_boundary_masker import IndicesBoundaryMasker
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
+from xlb.operator.boundary_masker.aabb import MeshMaskerAABB
+from xlb.operator.boundary_masker.ray import MeshMaskerRay
+from xlb.operator.boundary_masker.winding import MeshMaskerWinding
+from xlb.operator.boundary_masker.aabb_fill import MeshMaskerAABBFill
+from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
new file mode 100644
index 00000000..62297288
--- /dev/null
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -0,0 +1,130 @@
+import warp as wp
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
+from xlb.operator.operator import Operator
+
+
+class MeshMaskerAABB(MeshBoundaryMasker):
+    """
+    Mesh masker: voxels whose unit cube intersects the mesh become solid; fluid voxels next to solid ones become boundary voxels
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet,
+        precision_policy: PrecisionPolicy,
+        compute_backend: ComputeBackend,
+    ):
+        # No extra state here; all setup is delegated to MeshBoundaryMasker
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+    def _construct_warp(self):
+        # Lattice data captured by the kernels below: velocities, number of directions, opposite-direction table
+        _c = self.velocity_set.c
+        _q = self.velocity_set.q
+        _opp_indices = self.velocity_set.opp_indices
+
+        # Voxelize with self.mesh_voxel_intersect to find solid voxels (marked 255 in bc_mask)
+        #  - this gives an approximate 1 voxel thick surface around mesh
+        @wp.kernel
+        def kernel(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the voxel (index_to_position) and half a cell, used to form the `low` argument of the intersection test
+            pos_bc_cell = self.index_to_position(index)
+            half = wp.vec3(0.5, 0.5, 0.5)
+
+            if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell - half):
+                # Solid voxel: already flagged as solid (255) or intersecting this mesh
+                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+            else:
+                # Find the boundary voxels and their missing directions
+                for l in range(1, _q):
+                    _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+
+                    # Check whether the neighbor in direction l is solid - one mesh query per direction is super inefficient TODO: make it way better
+                    if self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell + _dir - half):
+                        # Solid neighbor in direction l:
+                        # tag this voxel with the BC id and mark the direction opposite to l as missing
+                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
+                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+
+        @wp.kernel
+        def kernel_with_distance(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the voxel (index_to_position) and half a cell, used to form the `low` argument of the intersection test
+            pos_bc_cell = self.index_to_position(index)
+            half = wp.vec3(0.5, 0.5, 0.5)
+
+            if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell - half):
+                # Solid voxel: already flagged as solid (255) or intersecting this mesh
+                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+            else:
+                # Find the boundary voxels and their missing directions
+                for l in range(1, _q):
+                    _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+
+                    # Check whether the neighbor in direction l is solid - one mesh query per direction is super inefficient TODO: make it way better
+                    if self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell + _dir - half):
+                        # Solid neighbor in direction l:
+                        # tag this voxel with the BC id and mark the direction opposite to l as missing
+                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
+                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+
+                        # Find the fractional distance to the mesh along direction l
+                        # The ray is cast up to 1.5 * |c_l| so that intersections in the neighboring cell are found too
+                        max_length = wp.length(_dir)
+                        query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, 1.5 * max_length)
+                        if query.result:
+                            # get position of the mesh triangle that intersects with the ray
+                            pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
+                            # Subtract half a link length from the hit distance to give some wall thickness
+                            dist = wp.length(pos_mesh - pos_bc_cell) - 0.5 * max_length
+                            weight = self.store_dtype(dist / max_length)
+                            distances[l, index[0], index[1], index[2]] = weight
+                            # Since the hit lies within 1.5 * max_length of the cell, weight falls in [-0.5, 1.0].
+                            # NOTE(review): non-positive weights are stored as-is - confirm the boundary condition tolerates them
+                        else:
+                            # We didn't have an intersection in the given direction but we know we should so we assume the solid is slightly thicker
+                            # and one lattice direction away from the BC voxel
+                            distances[l, index[0], index[1], index[2]] = self.store_dtype(1.0)
+
+        return None, [kernel, kernel_with_distance]
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+    ):
+        return self.warp_implementation_base(
+            bc,
+            distances,
+            bc_mask,
+            missing_mask,
+        )
diff --git a/xlb/operator/boundary_masker/aabb_fill.py b/xlb/operator/boundary_masker/aabb_fill.py
new file mode 100644
index 00000000..1460f012
--- /dev/null
+++ b/xlb/operator/boundary_masker/aabb_fill.py
@@ -0,0 +1,281 @@
+# Mesh masker based on AABB voxelization with dilate/erode gap filling
+
+import numpy as np
+import warp as wp
+import jax
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
+
+
+class MeshMaskerAABBFill(MeshBoundaryMasker):
+    """
+    Mesh masker that voxelizes the mesh, closes small gaps by a dilate-then-erode pass, and then builds bc_mask / missing_mask
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet,
+        precision_policy: PrecisionPolicy,
+        compute_backend: ComputeBackend,
+        fill_in_voxels: int = 3,
+    ):
+        # Tile geometry is set before calling super (NOTE(review): presumably the base constructor builds the kernels that read it - confirm)
+        self.tile_half = fill_in_voxels
+        self.tile_size = self.tile_half * 2 + 1
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+    def _construct_warp(self):
+        # Lattice data and tile sizes captured by the kernels below
+        _c = self.velocity_set.c
+        _q = self.velocity_set.q
+        _opp_indices = self.velocity_set.opp_indices
+        TILE_SIZE = wp.constant(self.tile_size)
+        TILE_HALF = wp.constant(self.tile_half)
+
+        # Erode the solid mask: interior voxels take the minimum over their TILE_SIZE^3 neighborhood; voxels within TILE_HALF of the border are copied
+        @wp.kernel
+        def erode_tile(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any)):
+            i, j, k = wp.tid()
+            if (
+                i < TILE_HALF
+                or i >= f_field.shape[0] - TILE_HALF
+                or j < TILE_HALF
+                or j >= f_field.shape[1] - TILE_HALF
+                or k < TILE_HALF
+                or k >= f_field.shape[2] - TILE_HALF
+            ):
+                f_field_out[i, j, k] = f_field[i, j, k]
+                return
+            t = wp.tile_load(f_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
+            min_val = wp.tile_min(t)
+            f_field_out[i, j, k] = min_val[0]
+
+        # Dilate the solid mask: interior voxels take the maximum over their TILE_SIZE^3 neighborhood; voxels within TILE_HALF of the border are copied
+        @wp.kernel
+        def dilate_tile(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any)):
+            i, j, k = wp.tid()
+            if (
+                i < TILE_HALF
+                or i >= f_field.shape[0] - TILE_HALF
+                or j < TILE_HALF
+                or j >= f_field.shape[1] - TILE_HALF
+                or k < TILE_HALF
+                or k >= f_field.shape[2] - TILE_HALF
+            ):
+                f_field_out[i, j, k] = f_field[i, j, k]
+                return
+            t = wp.tile_load(f_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
+            max_val = wp.tile_max(t)
+            f_field_out[i, j, k] = max_val[0]
+
+        # Construct the warp kernel
+        # Find solid voxels that intersect the mesh and mark them with 255 in solid_mask
+        @wp.kernel
+        def kernel_solid(
+            mesh_id: wp.uint64,
+            solid_mask: wp.array3d(dtype=wp.int32),
+            offset: wp.vec3f,
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the point, shifted by `offset` (the caller passes minus the padding width of solid_mask)
+            pos_bc_cell = self.index_to_position(index) + offset
+            half = wp.vec3(0.5, 0.5, 0.5)
+
+            if self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell - half):
+                # Make solid voxel
+                solid_mask[index[0], index[1], index[2]] = wp.int32(255)
+
+        # Assign the bc_mask based on the solid_mask we already computed
+        @wp.kernel
+        def kernel(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+            solid_mask: wp.array3d(dtype=wp.uint8),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            if solid_mask[i, j, k] == wp.uint8(255) or bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255):
+                # Solid voxel: keep voxels already flagged as solid (255), same as kernel_with_distance
+                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+            else:
+                # Find the boundary voxels and their missing directions
+                for l in range(1, _q):
+                    # Look up the neighbor in solid_mask. NOTE(review): the neighbor index is not bounds-checked at the domain edges - confirm
+                    if solid_mask[i + _c[0, l], j + _c[1, l], k + _c[2, l]] == wp.uint8(255):
+                        # Solid neighbor in direction l:
+                        # tag this voxel with the BC id and mark the direction opposite to l as missing
+                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
+                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+
+        # Assign the bc_mask and distances based on the solid_mask we already computed
+        @wp.kernel
+        def kernel_with_distance(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+            solid_mask: wp.array3d(dtype=wp.uint8),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the point (origin of the rays cast below)
+            pos_bc_cell = self.index_to_position(index)
+
+            if solid_mask[i, j, k] == wp.uint8(255) or bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255):
+                # Solid voxel: in the solid mask or already flagged as solid (255)
+                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+            else:
+                # Find the boundary voxels and their missing directions
+                for l in range(1, _q):
+                    _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+
+                    # Look up the neighbor in direction l in the precomputed solid mask.
+                    # NOTE(review): the neighbor index is not bounds-checked at the domain edges - confirm
+                    if solid_mask[i + _c[0, l], j + _c[1, l], k + _c[2, l]] == wp.uint8(255):
+                        # Solid neighbor in direction l:
+                        # tag this voxel with the BC id and mark the direction opposite to l as missing
+                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
+                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+
+                        # Find the fractional distance to the mesh along direction l
+                        # The ray is cast up to 1.5 * |c_l| so that intersections in the neighboring cell are found too
+                        max_length = wp.length(_dir)
+                        query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, 1.5 * max_length)
+                        if query.result:
+                            # get position of the mesh triangle that intersects with the ray
+                            pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
+                            # Subtract half a link length from the hit distance to give some wall thickness
+                            dist = wp.length(pos_mesh - pos_bc_cell) - 0.5 * max_length
+                            weight = self.store_dtype(dist / max_length)
+                            distances[l, index[0], index[1], index[2]] = weight
+                            # Since the hit lies within 1.5 * max_length of the cell, weight falls in [-0.5, 1.0].
+                            # NOTE(review): non-positive weights are stored as-is - confirm the boundary condition tolerates them
+                        else:
+                            # We didn't have an intersection in the given direction but we know we should so we assume the solid is slightly thicker
+                            # and one lattice direction away from the BC voxel
+                            distances[l, index[0], index[1], index[2]] = self.store_dtype(1.0)
+
+        kernel_dict = {
+            "kernel": kernel,
+            "kernel_with_distance": kernel_with_distance,
+            "kernel_solid": kernel_solid,
+            "erode_tile": erode_tile,
+            "dilate_tile": dilate_tile,
+        }
+        return None, kernel_dict
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+    ):
+        assert bc.mesh_vertices is not None, f'Please provide the mesh vertices for {bc.__class__.__name__} BC using keyword "mesh_vertices"!'
+        assert bc.indices is None, f"Please use IndicesBoundaryMasker operator if {bc.__class__.__name__} is imposed on known indices of the grid!"
+        assert bc.mesh_vertices.shape[1] == self.velocity_set.d, (
+            "Mesh points must be reshaped into an array (N, 3) where N indicates number of points!"
+        )
+
+        domain_shape = bc_mask.shape[1:]  # (nx, ny, nz)
+        mesh_vertices = bc.mesh_vertices
+        mesh_min = np.min(mesh_vertices, axis=0)
+        mesh_max = np.max(mesh_vertices, axis=0)
+
+        if any(mesh_min < 0) or any(mesh_max >= domain_shape):
+            raise ValueError(
+                f"Mesh extents ({mesh_min}, {mesh_max}) exceed domain dimensions {domain_shape}. The mesh must be fully contained within the domain."
+            )
+
+        # We are done with bc.mesh_vertices. Remove them from BC objects
+        bc.__dict__.pop("mesh_vertices", None)
+
+        mesh_indices = np.arange(mesh_vertices.shape[0])
+        mesh = wp.Mesh(
+            points=wp.array(mesh_vertices, dtype=wp.vec3),
+            indices=wp.array(mesh_indices, dtype=wp.int32),
+        )
+        mesh_id = wp.uint64(mesh.id)
+        bc_id = bc.id
+
+        # Create a padded mask for the solid voxels to account for the tile size
+        # It is padded by twice the tile half-width (tile_length) on each side since we run two tile operations
+        tile_length = 2 * self.tile_half
+        offset = wp.vec3f(-tile_length, -tile_length, -tile_length)
+        pad = 2 * tile_length
+        nx, ny, nz = domain_shape
+        solid_mask = wp.zeros((nx + pad, ny + pad, nz + pad), dtype=wp.int32)
+        solid_mask_out = wp.zeros((nx + pad, ny + pad, nz + pad), dtype=wp.int32)
+
+        # Prepare the warp kernel dictionary
+        kernel_dict = self.warp_kernel
+
+        # Build the solid mask: voxelize, then dilate followed by erode to close small gaps
+        wp.launch(
+            kernel=kernel_dict["kernel_solid"],
+            inputs=[
+                mesh_id,
+                solid_mask,
+                offset,
+            ],
+            dim=solid_mask.shape,
+        )
+        wp.launch_tiled(
+            kernel=kernel_dict["dilate_tile"],
+            dim=solid_mask.shape,
+            block_dim=32,
+            inputs=[solid_mask, solid_mask_out],
+        )
+        wp.launch_tiled(
+            kernel=kernel_dict["erode_tile"],
+            dim=solid_mask.shape,
+            block_dim=32,
+            inputs=[solid_mask_out, solid_mask],
+        )
+        solid_mask_cropped = wp.array(
+            solid_mask[tile_length : tile_length + nx, tile_length : tile_length + ny, tile_length : tile_length + nz],
+            dtype=wp.uint8,
+        )
+
+        # Launch the main kernel for boundary masker
+        if bc.needs_mesh_distance:
+            wp.launch(
+                kernel_dict["kernel_with_distance"],
+                inputs=[mesh_id, bc_id, distances, bc_mask, missing_mask, solid_mask_cropped],
+                dim=bc_mask.shape[1:],
+            )
+        else:
+            wp.launch(
+                kernel_dict["kernel"],
+                inputs=[mesh_id, bc_id, bc_mask, missing_mask, solid_mask_cropped],
+                dim=bc_mask.shape[1:],
+            )
+
+        # Resolve out of bound indices
+        wp.launch(
+            self.resolve_out_of_bound_kernel,
+            inputs=[bc_id, bc_mask, missing_mask],
+            dim=bc_mask.shape[1:],
+        )
+        return distances, bc_mask, missing_mask
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 3c5ea867..1803fd68 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -69,7 +69,9 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         domain_shape = bc_mask[0].shape
         for bc in bclist:
             assert bc.indices is not None, f"Please specify indices associated with the {bc.__class__.__name__} BC!"
-            assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
+            assert bc.mesh_vertices is None, (
+                f"Please use operators based on MeshBoundaryMasker if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
+            )
             id_number = bc.id
             bc_indices = np.array(bc.indices)
             local_indices = bc_indices - np.array(start_index)[:, np.newaxis]
@@ -106,9 +108,8 @@ def _construct_warp(self):
         _q = wp.constant(self.velocity_set.q)
 
         @wp.func
-        def check_index_bounds(index: wp.vec3i, shape: wp.vec3i):
-            is_in_bounds = index[0] >= 0 and index[0] < shape[0] and index[1] >= 0 and index[1] < shape[1] and index[2] >= 0 and index[2] < shape[2]
-            return is_in_bounds
+        def is_in_bounds(index: wp.vec3i, shape: wp.vec3i):
+            return index[0] >= 0 and index[0] < shape[0] and index[1] >= 0 and index[1] < shape[1] and index[2] >= 0 and index[2] < shape[2]
 
         # Construct the warp 3D kernel
         @wp.kernel
@@ -130,7 +131,7 @@ def kernel(
 
             # Check if index is in bounds
             shape = wp.vec3i(missing_mask.shape[1], missing_mask.shape[2], missing_mask.shape[3])
-            if check_index_bounds(index, shape):
+            if is_in_bounds(index, shape):
                 # Stream indices
                 for l in range(_q):
                     # Get the index of the streaming direction
@@ -145,12 +146,12 @@ def kernel(
 
                     # check if pull index is out of bound
                     # These directions will have missing information after streaming
-                    if not check_index_bounds(pull_index, shape):
+                    if not is_in_bounds(pull_index, shape):
                         # Set the missing mask
                         missing_mask[l, index[0], index[1], index[2]] = True
 
                     # handling geometries in the interior of the computational domain
-                    elif check_index_bounds(pull_index, shape) and is_interior[ii]:
+                    elif is_in_bounds(pull_index, shape) and is_interior[ii]:
                         # Set the missing mask
                         missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
                         bc_mask[0, push_index[0], push_index[1], push_index[2]] = id_number[ii]
@@ -168,8 +169,9 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         current_index = 0
         for bc in bclist:
             assert bc.indices is not None, f'Please specify indices associated with the {bc.__class__.__name__} BC using keyword "indices"!'
-            assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
-
+            assert bc.mesh_vertices is None, (
+                f"Please use operators based on MeshBoundaryMasker if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
+            )
             bc_indices = np.asarray(bc.indices)
             num_indices = bc_indices.shape[1]
 
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index 40dd0311..1bcc2000 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -1,8 +1,7 @@
-# Base class for all equilibriums
+# Base class for mesh masker operators
 
 import numpy as np
 import warp as wp
-import jax
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
@@ -27,30 +26,9 @@ def __init__(
         if self.velocity_set.d == 2:
             raise NotImplementedError("This Operator is not implemented in 2D!")
 
-        # Also using Warp kernels for JAX implementation
-        if self.compute_backend == ComputeBackend.JAX:
-            self.warp_functional, self.warp_kernel = self._construct_warp()
-
-    @Operator.register_backend(ComputeBackend.JAX)
-    def jax_implementation(
-        self,
-        bc,
-        bc_mask,
-        missing_mask,
-    ):
-        raise NotImplementedError(f"Operation {self.__class__.__name} not implemented in JAX!")
-        # Use Warp backend even for this particular operation.
-        wp.init()
-        bc_mask = wp.from_jax(bc_mask)
-        missing_mask = wp.from_jax(missing_mask)
-        bc_mask, missing_mask = self.warp_implementation(bc, bc_mask, missing_mask)
-        return wp.to_jax(bc_mask), wp.to_jax(missing_mask)
-
-    def _construct_warp(self):
         # Make constants for warp
-        _c_float = self.velocity_set.c_float
-        _q = wp.constant(self.velocity_set.q)
-        _opp_indices = self.velocity_set.opp_indices
+        _c = self.velocity_set.c
+        _q = self.velocity_set.q
 
         @wp.func
         def index_to_position(index: wp.vec3i):
@@ -59,6 +37,33 @@ def index_to_position(index: wp.vec3i):
             pos = ijk + wp.vec3(0.5, 0.5, 0.5)  # cell center
             return pos
 
+        @wp.func
+        def is_in_bounds(index: wp.vec3i, domain_shape: wp.vec3i):
+            # Report whether a voxel index lies inside the domain.
+            # Each component must satisfy 0 <= index[a] < domain_shape[a].
+            inside_x = index[0] >= 0 and index[0] < domain_shape[0]
+            inside_y = index[1] >= 0 and index[1] < domain_shape[1]
+            inside_z = index[2] >= 0 and index[2] < domain_shape[2]
+
+            # All three axes have to pass
+            return inside_x and inside_y and inside_z
+
+        @wp.func
+        def out_of_bound_pull_index(
+            lattice_dir: wp.int32,
+            index: wp.vec3i,
+            domain_shape: wp.vec3i,
+        ):
+            # Pull index: the cell this direction streams from, i.e. index - c[:, lattice_dir]
+            pull_index = wp.vec3i()
+            for d in range(self.velocity_set.d):
+                pull_index[d] = index[d] - _c[d, lattice_dir]
+
+            # Return True when the pull index falls outside the domain:
+            # such directions will have missing information after streaming
+            missing = not is_in_bounds(pull_index, domain_shape)
+            return missing
+
         # Function to precompute useful values per triangle, assuming spacing is (1,1,1)
         # inputs: verts: triangle vertices, normal: triangle unit normal
         # outputs: dist1, dist2, normal_edge0, normal_edge1, dist_edge
@@ -78,6 +83,7 @@ def pre_compute(
             dist_edge = wp.mat33f(0.0)
 
             for axis0 in range(0, 3):
+                axis1 = (axis0 + 1) % 3
                 axis2 = (axis0 + 2) % 3
 
                 sgn = 1.0
@@ -85,21 +91,18 @@ def pre_compute(
                     sgn = -1.0
 
                 for i in range(0, 3):
-                    normal_edge0[i][axis0] = -1.0 * sgn * edges[i][axis0]
-                    normal_edge1[i][axis0] = sgn * edges[i][axis0]
+                    normal_edge0[i, axis0] = -1.0 * sgn * edges[i, axis1]
+                    normal_edge1[i, axis0] = sgn * edges[i, axis0]
 
-                    dist_edge[i][axis0] = (
-                        -1.0 * (normal_edge0[i][axis0] * verts[i][axis0] + normal_edge1[i][axis0] * verts[i][axis0])
-                        + wp.max(0.0, normal_edge0[i][axis0])
-                        + wp.max(0.0, normal_edge1[i][axis0])
+                    dist_edge[i, axis0] = (
+                        -1.0 * (normal_edge0[i, axis0] * verts[i, axis0] + normal_edge1[i, axis0] * verts[i, axis1])
+                        + wp.max(0.0, normal_edge0[i, axis0])
+                        + wp.max(0.0, normal_edge1[i, axis0])
                     )
 
             return dist1, dist2, normal_edge0, normal_edge1, dist_edge
 
         # Check whether this triangle intersects the unit cube at position low
-        #  inputs: low: position of the cube, normal: triangle unit normal, dist1, dist2, normal_edge0, normal_edge1, dist_edge: precomputed values
-        #  outputs: True if intersection, False otherwise
-        #  reference: Fast parallel surface and solid voxelization on GPUs, M. Schwarz, H-P. Siedel, https://dl.acm.org/doi/10.1145/1882261.1866201
         @wp.func
         def triangle_box_intersect(
             low: wp.vec3f,
@@ -116,7 +119,7 @@ def triangle_box_intersect(
                 for ax0 in range(0, 3):
                     ax1 = (ax0 + 1) % 3
                     for i in range(0, 3):
-                        intersect = intersect and (normal_edge0[i][ax0] * low[ax0] + normal_edge1[i][ax0] * low[ax1] + dist_edge[i][ax0] >= 0.0)
+                        intersect = intersect and (normal_edge0[i, ax0] * low[ax0] + normal_edge1[i, ax0] * low[ax1] + dist_edge[i, ax0] >= 0.0)
 
                 return intersect
             else:
@@ -147,12 +150,8 @@ def mesh_voxel_intersect(mesh_id: wp.uint64, low: wp.vec3):
 
             return False
 
-        # Construct the warp kernel
-        # Do voxelization mesh query (warp.mesh_query_aabb) to find solid voxels
-        #  - this gives an approximate 1 voxel thick surface around mesh
         @wp.kernel
-        def kernel(
-            mesh_id: wp.uint64,
+        def resolve_out_of_bound_kernel(
             id_number: wp.int32,
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.bool),
@@ -163,31 +162,43 @@ def kernel(
             # Get local indices
             index = wp.vec3i(i, j, k)
 
-            # position of the point
-            pos_bc_cell = index_to_position(index)
-            half = wp.vec3(0.5, 0.5, 0.5)
+            # domain shape to check for out of bounds
+            domain_shape = wp.vec3i(bc_mask.shape[1], bc_mask.shape[2], bc_mask.shape[3])
 
-            if mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell - half):
-                # Make solid voxel
-                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
-            else:
-                # Find the fractional distance to the mesh in each direction
+            # For cells already tagged with this BC id, flag pull directions that fall outside the domain
+            if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(id_number):
                 for l in range(1, _q):
-                    _dir = wp.vec3f(_c_float[0, l], _c_float[1, l], _c_float[2, l])
-
-                    # Check to see if this neighbor is solid - this is super inefficient TODO: make it way better
-                    if mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell + _dir - half):
-                        # We know we have a solid neighbor
-                        # Set the boundary id and missing_mask
-                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+                    # Ensuring out of bound pull indices are properly considered in the missing_mask
+                    if out_of_bound_pull_index(l, index, domain_shape):
+                        missing_mask[l, index[0], index[1], index[2]] = True
+
+        # Construct some helper warp functions
+        if self.compute_backend == ComputeBackend.WARP:
+            self.index_to_position = index_to_position
+            self.is_in_bounds = is_in_bounds
+            self.out_of_bound_pull_index = out_of_bound_pull_index
+            self.mesh_voxel_intersect = mesh_voxel_intersect
+            self.resolve_out_of_bound_kernel = resolve_out_of_bound_kernel
 
-        return None, kernel
+    @Operator.register_backend(ComputeBackend.JAX)
+    def jax_implementation(
+        self,
+        bc,
+        bc_mask,
+        missing_mask,
+    ):
+        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in JAX!")
+        # NOTE: unreachable after the raise above. Use Warp backend even for this particular operation.
+        wp.init()
+        bc_mask = wp.from_jax(bc_mask)
+        missing_mask = wp.from_jax(missing_mask)
+        bc_mask, missing_mask = self.warp_implementation(bc, bc_mask, missing_mask)
+        return wp.to_jax(bc_mask), wp.to_jax(missing_mask)
 
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(
+    def warp_implementation_base(
         self,
         bc,
+        distances,
         bc_mask,
         missing_mask,
     ):
@@ -196,11 +207,9 @@ def warp_implementation(
         assert bc.mesh_vertices.shape[1] == self.velocity_set.d, (
             "Mesh points must be reshaped into an array (N, 3) where N indicates number of points!"
         )
-        mesh_vertices = bc.mesh_vertices
-        id_number = bc.id
 
-        # Check mesh extents against domain dimensions
         domain_shape = bc_mask.shape[1:]  # (nx, ny, nz)
+        mesh_vertices = bc.mesh_vertices
         mesh_min = np.min(mesh_vertices, axis=0)
         mesh_max = np.max(mesh_vertices, axis=0)
 
@@ -212,25 +221,31 @@ def warp_implementation(
         # We are done with bc.mesh_vertices. Remove them from BC objects
         bc.__dict__.pop("mesh_vertices", None)
 
-        # Ensure this masker is called only for BCs that need implicit distance to the mesh
-        assert not bc.needs_mesh_distance, 'Please use "MeshDistanceBoundaryMasker" if this BC needs mesh distance!'
-
         mesh_indices = np.arange(mesh_vertices.shape[0])
         mesh = wp.Mesh(
             points=wp.array(mesh_vertices, dtype=wp.vec3),
-            indices=wp.array(mesh_indices, dtype=int),
+            indices=wp.array(mesh_indices, dtype=wp.int32),
         )
-
-        # Launch the warp kernel
+        mesh_id = wp.uint64(mesh.id)
+        bc_id = bc.id
+
+        # Launch the appropriate warp kernel
+        kernel_list = self.warp_kernel
+        if bc.needs_mesh_distance:
+            wp.launch(
+                kernel_list[1],
+                inputs=[mesh_id, bc_id, distances, bc_mask, missing_mask],
+                dim=bc_mask.shape[1:],
+            )
+        else:
+            wp.launch(
+                kernel_list[0],
+                inputs=[mesh_id, bc_id, bc_mask, missing_mask],
+                dim=bc_mask.shape[1:],
+            )
         wp.launch(
-            self.warp_kernel,
-            inputs=[
-                mesh.id,
-                id_number,
-                bc_mask,
-                missing_mask,
-            ],
+            self.resolve_out_of_bound_kernel,
+            inputs=[bc_id, bc_mask, missing_mask],
             dim=bc_mask.shape[1:],
         )
-
-        return bc_mask, missing_mask
+        return distances, bc_mask, missing_mask
diff --git a/xlb/operator/boundary_masker/mesh_voxelization_method.py b/xlb/operator/boundary_masker/mesh_voxelization_method.py
new file mode 100644
index 00000000..ce5eb93e
--- /dev/null
+++ b/xlb/operator/boundary_masker/mesh_voxelization_method.py
@@ -0,0 +1,10 @@
+# Enum used to keep track of the available voxelization methods
+
+from enum import Enum, auto
+
+
+class MeshVoxelizationMethod(Enum):
+    AABB = auto()
+    RAY = auto()
+    AABB_FILL = auto()
+    WINDING = auto()
diff --git a/xlb/operator/boundary_masker/ray.py b/xlb/operator/boundary_masker/ray.py
new file mode 100644
index 00000000..d5ec27c2
--- /dev/null
+++ b/xlb/operator/boundary_masker/ray.py
@@ -0,0 +1,107 @@
+import warp as wp
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
+from xlb.operator.operator import Operator
+
+
+class MeshMaskerRay(MeshBoundaryMasker):
+    """
+    Operator for creating a boundary missing_mask from a mesh (e.g. an STL file) by casting rays along the lattice directions
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet,
+        precision_policy: PrecisionPolicy,
+        compute_backend: ComputeBackend.WARP,
+    ):
+        # Call super
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+    def _construct_warp(self):
+        # Make constants for warp
+        _c = self.velocity_set.c
+        _q = self.velocity_set.q
+        _opp_indices = self.velocity_set.opp_indices
+
+        @wp.kernel
+        def kernel(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the point
+            pos_bc_cell = self.index_to_position(index)
+
+            for l in range(1, _q):
+                _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+                # Max length depends on ray direction (diagonals are longer)
+                max_length = wp.length(_dir)
+                query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, max_length)
+                if query.result:
+                    # Set the boundary id and missing_mask
+                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
+                    missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+
+        @wp.kernel
+        def kernel_with_distance(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the point
+            pos_bc_cell = self.index_to_position(index)
+
+            # Find the fractional distance to the mesh in each direction
+            for l in range(1, _q):
+                _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+                # Max length depends on ray direction (diagonals are longer)
+                max_length = wp.length(_dir)
+                query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, max_length)
+                if query.result:
+                    # Set the boundary id and missing_mask
+                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
+                    missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+
+                    # get position of the mesh triangle that intersects with the ray
+                    pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
+                    dist = wp.length(pos_mesh - pos_bc_cell)
+                    weight = self.store_dtype(dist / max_length)
+                    distances[l, index[0], index[1], index[2]] = weight
+                    # if weight < 0.0 or weight > 1.0:
+                    #     wp.printf("Got bad weight %f at %d,%d,%d\n", weight, index[0], index[1], index[2])
+
+        return None, [kernel, kernel_with_distance]
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+    ):
+        return self.warp_implementation_base(
+            bc,
+            distances,
+            bc_mask,
+            missing_mask,
+        )
diff --git a/xlb/operator/boundary_masker/winding.py b/xlb/operator/boundary_masker/winding.py
new file mode 100644
index 00000000..fc813012
--- /dev/null
+++ b/xlb/operator/boundary_masker/winding.py
@@ -0,0 +1,145 @@
+import warp as wp
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
+from xlb.operator.operator import Operator
+
+
+class MeshMaskerWinding(MeshBoundaryMasker):
+    """
+    Operator for creating a boundary missing_mask from a mesh (e.g. an STL file) using winding-number inside/outside queries
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet,
+        precision_policy: PrecisionPolicy,
+        compute_backend: ComputeBackend.WARP,
+    ):
+        # Call super
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+    def _construct_warp(self):
+        # Make constants for warp
+        _c = self.velocity_set.c
+        _q = self.velocity_set.q
+        _opp_indices = self.velocity_set.opp_indices
+
+        @wp.kernel
+        def kernel(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the point
+            pos_bc_cell = self.index_to_position(index)
+
+            # Compute the maximum length
+            max_length = wp.sqrt(
+                (wp.float32(bc_mask.shape[1])) ** 2.0 + (wp.float32(bc_mask.shape[2])) ** 2.0 + (wp.float32(bc_mask.shape[3])) ** 2.0
+            )
+
+            # evaluate if point is inside mesh
+            query = wp.mesh_query_point_sign_winding_number(mesh_id, pos_bc_cell, max_length)
+            if query.result:
+                # set point to be solid
+                if query.sign <= 0:  # TODO: fix this
+                    # Make solid voxel
+                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+
+                    # Cast a one-link ray along each lattice direction to detect where the mesh is crossed
+                    for l in range(1, _q):
+                        _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+                        # Max length depends on ray direction (diagonals are longer)
+                        max_length = wp.length(_dir)
+                        query_dir = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, max_length)
+                        if query_dir.result:
+                            # Get the index of the streaming direction
+                            push_index = wp.vec3i()
+                            for d in range(self.velocity_set.d):
+                                push_index[d] = index[d] + _c[d, l]
+
+                            # Set the boundary id and missing_mask
+                            bc_mask[0, push_index[0], push_index[1], push_index[2]] = wp.uint8(id_number)
+                            missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
+
+        @wp.kernel
+        def kernel_with_distance(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # position of the point
+            pos_cell = self.index_to_position(index)
+
+            # Compute the maximum length
+            max_length = wp.sqrt(
+                (wp.float32(bc_mask.shape[1])) ** 2.0 + (wp.float32(bc_mask.shape[2])) ** 2.0 + (wp.float32(bc_mask.shape[3])) ** 2.0
+            )
+
+            # evaluate if point is inside mesh
+            query = wp.mesh_query_point_sign_winding_number(mesh_id, pos_cell, max_length)
+            if query.result:
+                # set point to be solid
+                if query.sign <= 0:  # TODO: fix this
+                    # Make solid voxel
+                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+
+                    # Find the fractional distance to the mesh in each direction
+                    for l in range(1, _q):
+                        _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+                        # Max length depends on ray direction (diagonals are longer)
+                        max_length = wp.length(_dir)
+                        query_dir = wp.mesh_query_ray(mesh_id, pos_cell, _dir / max_length, max_length)
+                        if query_dir.result:
+                            # Get the index of the streaming direction
+                            push_index = wp.vec3i()
+                            for d in range(self.velocity_set.d):
+                                push_index[d] = index[d] + _c[d, l]
+
+                            # Set the boundary id and missing_mask
+                            bc_mask[0, push_index[0], push_index[1], push_index[2]] = wp.uint8(id_number)
+                            missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
+
+                            # get position of the mesh triangle that intersects with the ray
+                            pos_mesh = wp.mesh_eval_position(mesh_id, query_dir.face, query_dir.u, query_dir.v)
+                            pos_bc_cell = self.index_to_position(push_index)
+                            dist = wp.length(pos_mesh - pos_bc_cell)
+                            weight = self.store_dtype(dist / max_length)
+                            distances[_opp_indices[l], push_index[0], push_index[1], push_index[2]] = weight
+                            # if weight < 0.0 or weight > 1.0:
+                            #     wp.printf("Got bad weight %f at %d,%d,%d\n", weight, push_index[0], push_index[1], push_index[2])
+
+        return None, [kernel, kernel_with_distance]
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+    ):
+        return self.warp_implementation_base(
+            bc,
+            distances,
+            bc_mask,
+            missing_mask,
+        )
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 1c6255d3..88f0d4b2 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -139,7 +139,7 @@ def kernel(
                 # Get the distribution function
                 f_post_collision = _f_vec()
                 for l in range(self.velocity_set.q):
-                    f_post_collision[l] = f_0[l, index[0], index[1], index[2]]
+                    f_post_collision[l] = self.compute_dtype(f_0[l, index[0], index[1], index[2]])
 
                 # Apply streaming (pull method)
                 timestep = 0
diff --git a/xlb/operator/macroscopic/first_moment.py b/xlb/operator/macroscopic/first_moment.py
index cb99a9ff..c29445d7 100644
--- a/xlb/operator/macroscopic/first_moment.py
+++ b/xlb/operator/macroscopic/first_moment.py
@@ -23,17 +23,31 @@ def _construct_warp(self):
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
 
         @wp.func
-        def functional(
-            f: _f_vec,
-            rho: Any,
-        ):
-            u = _u_vec()
+        def neumaier_sum_component(d: int, f: _f_vec):
+            total = self.compute_dtype(0.0)
+            compensation = self.compute_dtype(0.0)
             for l in range(self.velocity_set.q):
-                for d in range(self.velocity_set.d):
-                    if _c[d, l] == 1:
-                        u[d] += f[l]
-                    elif _c[d, l] == -1:
-                        u[d] -= f[l]
+                # Get contribution based on the sign of _c[d, l]
+                if _c[d, l] == 1:
+                    val = f[l]
+                elif _c[d, l] == -1:
+                    val = -f[l]
+                else:
+                    val = self.compute_dtype(0.0)
+                t = total + val
+                if wp.abs(total) >= wp.abs(val):
+                    compensation = compensation + ((total - t) + val)
+                else:
+                    compensation = compensation + ((val - t) + total)
+                total = t
+            return total + compensation
+
+        @wp.func
+        def functional(f: _f_vec, rho: Any):
+            u = _u_vec()
+            # Use Neumaier summation for each spatial component
+            for d in range(self.velocity_set.d):
+                u[d] = neumaier_sum_component(d, f)
             u /= rho
             return u
 
diff --git a/xlb/operator/macroscopic/zero_moment.py b/xlb/operator/macroscopic/zero_moment.py
index 8abb4de7..c81dcfed 100644
--- a/xlb/operator/macroscopic/zero_moment.py
+++ b/xlb/operator/macroscopic/zero_moment.py
@@ -20,11 +20,23 @@ def _construct_warp(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         @wp.func
-        def functional(f: _f_vec):
-            rho = self.compute_dtype(0.0)
+        def neumaier_sum(f: _f_vec):
+            total = self.compute_dtype(0.0)
+            compensation = self.compute_dtype(0.0)
             for l in range(self.velocity_set.q):
-                rho += f[l]
-            return rho
+                x = f[l]
+                t = total + x
+                # Using wp.abs to compute absolute value
+                if wp.abs(total) >= wp.abs(x):
+                    compensation = compensation + ((total - t) + x)
+                else:
+                    compensation = compensation + ((x - t) + total)
+                total = t
+            return total + compensation
+
+        @wp.func
+        def functional(f: _f_vec):
+            return neumaier_sum(f)
 
         @wp.kernel
         def kernel(
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 7326891a..3a17d780 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -17,7 +17,14 @@
 from xlb.operator.boundary_condition.boundary_condition import ImplementationStep
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.collision import ForcedCollision
-from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
+from xlb.operator.boundary_masker import (
+    IndicesBoundaryMasker,
+    MeshVoxelizationMethod,
+    MeshMaskerAABB,
+    MeshMaskerRay,
+    MeshMaskerWinding,
+    MeshMaskerAABBFill,
+)
 from xlb.helper import check_bc_overlaps
 from xlb.helper.nse_solver import create_nse_fields
 
@@ -70,7 +77,7 @@ def prepare_fields(self, initializer=None):
 
         # Initialize distribution functions if initializer is provided
         if initializer is not None:
-            f_0 = initializer(self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
+            f_0 = initializer(f_0)
         else:
             from xlb.helper.initializers import initialize_eq
 
@@ -82,49 +89,79 @@ def prepare_fields(self, initializer=None):
         else:
             wp.copy(f_1, f_0)
 
+        # Important note: XLB uses f_1 buffer (center index and missing directions) to store auxiliary data for boundary conditions.
+
         # Process boundary conditions and update masks
-        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask)
+        f_1, bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, f_1, bc_mask, missing_mask)
+
         # Initialize auxiliary data if needed
-        f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
+        f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_1, bc_mask, missing_mask)
 
         return f_0, f_1, bc_mask, missing_mask
 
     @classmethod
-    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask):
+    def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing_mask):
         """Process boundary conditions and update boundary masks."""
+
         # Check for boundary condition overlaps
         check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
+
         # Create boundary maskers
         indices_masker = IndicesBoundaryMasker(
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
         )
+
         # Split boundary conditions by type
         bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
         bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
+
         # Process indices-based boundary conditions
         if bc_with_indices:
             bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
+
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
-            mesh_masker = MeshBoundaryMasker(
-                velocity_set=DefaultConfig.velocity_set,
-                precision_policy=DefaultConfig.default_precision_policy,
-                compute_backend=DefaultConfig.default_backend,
-            )
             for bc in bc_with_vertices:
-                bc_mask, missing_mask = mesh_masker(bc, bc_mask, missing_mask)
+                if bc.voxelization_method is MeshVoxelizationMethod.AABB:
+                    mesh_masker = MeshMaskerAABB(
+                        velocity_set=DefaultConfig.velocity_set,
+                        precision_policy=DefaultConfig.default_precision_policy,
+                        compute_backend=DefaultConfig.default_backend,
+                    )
+                elif bc.voxelization_method is MeshVoxelizationMethod.RAY:
+                    mesh_masker = MeshMaskerRay(
+                        velocity_set=DefaultConfig.velocity_set,
+                        precision_policy=DefaultConfig.default_precision_policy,
+                        compute_backend=DefaultConfig.default_backend,
+                    )
+                elif bc.voxelization_method is MeshVoxelizationMethod.WINDING:
+                    mesh_masker = MeshMaskerWinding(
+                        velocity_set=DefaultConfig.velocity_set,
+                        precision_policy=DefaultConfig.default_precision_policy,
+                        compute_backend=DefaultConfig.default_backend,
+                    )
+                elif bc.voxelization_method is MeshVoxelizationMethod.AABB_FILL:
+                    mesh_masker = MeshMaskerAABBFill(
+                        velocity_set=DefaultConfig.velocity_set,
+                        precision_policy=DefaultConfig.default_precision_policy,
+                        compute_backend=DefaultConfig.default_backend,
+                    )
+                else:
+                    raise ValueError(f"Unsupported voxelization method: {bc.voxelization_method}")
+                # Apply the mesh masker to the boundary condition
+                f_1, bc_mask, missing_mask = mesh_masker(bc, f_1, bc_mask, missing_mask)
 
-        return bc_mask, missing_mask
+        return f_1, bc_mask, missing_mask
 
     @staticmethod
-    def _initialize_auxiliary_data(boundary_conditions, f_0, f_1, bc_mask, missing_mask):
+    def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
-                f_0, f_1 = bc.aux_data_init(f_0, f_1, bc_mask, missing_mask)
-        return f_0, f_1
+                f_1 = bc.aux_data_init(f_1, bc_mask, missing_mask)
+        return f_1
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0,))
@@ -160,7 +197,7 @@ def jax_implementation(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
 
         # Apply collision type boundary conditions
         for bc in self.boundary_conditions:
-            f_post_collision = bc.update_bc_auxilary_data(f_post_stream, f_post_collision, bc_mask, missing_mask)
+            f_post_collision = bc.assemble_dynamic_data(f_post_stream, f_post_collision, bc_mask, missing_mask)
             if bc.implementation_step == ImplementationStep.COLLISION:
                 f_post_collision = bc(
                     f_post_stream,
@@ -182,15 +219,12 @@ def _construct_warp(self):
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
-        id_to_bc = boundary_condition_registry.id_to_bc
 
         # Gather IDs of ExtrapolationOutflowBC boundary conditions
         extrapolation_outflow_bc_ids = []
         for bc_name, bc_id in bc_to_id.items():
             if bc_name.startswith("ExtrapolationOutflowBC"):
                 extrapolation_outflow_bc_ids.append(bc_id)
-        # Group active boundary conditions
-        active_bcs = set(boundary_condition_registry.id_to_bc[bc.id] for bc in self.boundary_conditions)
 
         _opp_indices = self.velocity_set.opp_indices
 
@@ -198,8 +232,9 @@ def _construct_warp(self):
         def apply_bc(
             index: Any,
             timestep: Any,
+            bc_mask: Any,
             _boundary_id: Any,
-            missing_mask: Any,
+            _missing_mask: Any,
             f_0: Any,
             f_1: Any,
             f_pre: Any,
@@ -213,15 +248,15 @@ def apply_bc(
                 if is_post_streaming:
                     if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].warp_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                            f_result = wp.static(self.boundary_conditions[i].warp_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
                 else:
                     if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].warp_functional)(index, timestep, missing_mask, f_0, f_1, f_pre, f_post)
+                            f_result = wp.static(self.boundary_conditions[i].warp_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
                     if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].update_bc_auxilary_data)(
-                                index, timestep, missing_mask, f_0, f_1, f_pre, f_post
+                            f_result = wp.static(self.boundary_conditions[i].assemble_dynamic_data)(
+                                index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
                             )
             return f_result
 
@@ -296,14 +331,14 @@ def kernel(
             _f_post_collision = _f0_thread
 
             # Apply post-streaming boundary conditions
-            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_collision, _f_post_stream, True)
+            _f_post_stream = apply_bc(index, timestep, bc_mask, _boundary_id, _missing_mask, f_0, f_1, _f_post_collision, _f_post_stream, True)
 
             _rho, _u = self.macroscopic.warp_functional(_f_post_stream)
             _feq = self.equilibrium.warp_functional(_rho, _u)
             _f_post_collision = self.collision.warp_functional(_f_post_stream, _feq, _rho, _u, omega)
 
             # Apply post-collision boundary conditions
-            _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_stream, _f_post_collision, False)
+            _f_post_collision = apply_bc(index, timestep, bc_mask, _boundary_id, _missing_mask, f_0, f_1, _f_post_stream, _f_post_collision, False)
 
             # Apply auxiliary recovery for boundary conditions (swapping)
             apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0, _f1_thread)
diff --git a/xlb/utils/__init__.py b/xlb/utils/__init__.py
index 3c8032e2..213e936d 100644
--- a/xlb/utils/__init__.py
+++ b/xlb/utils/__init__.py
@@ -1,9 +1,14 @@
-from .utils import (
-    downsample_field,
-    save_image,
-    save_fields_vtk,
-    save_BCs_vtk,
-    rotate_geometry,
-    voxelize_stl,
-    axangle2mat,
-)
+from .utils import (
+    downsample_field,
+    save_image,
+    save_fields_vtk,
+    save_BCs_vtk,
+    rotate_geometry,
+    voxelize_stl,
+    axangle2mat,
+    save_fields_hdf5,
+    voxelize_stl_open3d,
+    q_criterion,
+    map_field_vtk,
+    map_field_vtk_interpolator,
+)
diff --git a/xlb/utils/utils.py b/xlb/utils/utils.py
index 0a9858a5..dda4534f 100644
--- a/xlb/utils/utils.py
+++ b/xlb/utils/utils.py
@@ -1,316 +1,651 @@
-import numpy as np
-import matplotlib.pylab as plt
-from matplotlib import cm
-from time import time
-import pyvista as pv
-from jax.image import resize
-from jax import jit
-import jax.numpy as jnp
-from functools import partial
-import trimesh
-
-import os
-import __main__
-
-
-@partial(jit, static_argnums=(1, 2))
-def downsample_field(field, factor, method="bicubic"):
-    """
-    Downsample a JAX array by a factor of `factor` along each axis.
-
-    Parameters
-    ----------
-    field : jax.numpy.ndarray
-        The input vector field to be downsampled. This should be a 3D or 4D JAX array where the last dimension is 2 or 3 (vector components).
-    factor : int
-        The factor by which to downsample the field. The dimensions of the field will be divided by this factor.
-    method : str, optional
-        The method to use for downsampling. Default is 'bicubic'.
-
-    Returns
-    -------
-    jax.numpy.ndarray
-        The downsampled field.
-    """
-    if factor == 1:
-        return field
-    else:
-        new_shape = tuple(dim // factor for dim in field.shape[:-1])
-        downsampled_components = []
-        for i in range(field.shape[-1]):  # Iterate over the last dimension (vector components)
-            resized = resize(field[..., i], new_shape, method=method)
-            downsampled_components.append(resized)
-
-        return jnp.stack(downsampled_components, axis=-1)
-
-
-def save_image(fld, timestep=None, prefix=None, **kwargs):
-    """
-    Save an image of a field at a given timestep.
-
-    Parameters
-    ----------
-    timestep : int
-        The timestep at which the field is being saved.
-    fld : jax.numpy.ndarray
-        The field to be saved. This should be a 2D or 3D JAX array. If the field is 3D, the magnitude of the field will be calculated and saved.
-    prefix : str, optional
-        A prefix to be added to the filename. The filename will be the name of the main script file by default.
-
-    Returns
-    -------
-    None
-
-    Notes
-    -----
-    This function saves the field as an image in the PNG format.
-    The filename is based on the name of the main script file, the provided prefix, and the timestep number.
-    If the field is 3D, the magnitude of the field is calculated and saved.
-    The image is saved with the 'nipy_spectral' colormap and the origin set to 'lower'.
-    """
-    if prefix is None:
-        fname = os.path.basename(__main__.__file__)
-        fname = os.path.splitext(fname)[0]
-    else:
-        fname = prefix
-
-    if timestep is not None:
-        fname = fname + "_" + str(timestep).zfill(4)
-
-    if len(fld.shape) > 3:
-        raise ValueError("The input field should be 2D!")
-    if len(fld.shape) == 3:
-        fld = np.sqrt(fld[0, ...] ** 2 + fld[0, ...] ** 2)
-
-    plt.clf()
-    kwargs.pop("cmap", None)
-    plt.imsave(fname + ".png", fld.T, cmap=cm.nipy_spectral, origin="lower", **kwargs)
-
-
-def save_fields_vtk(fields, timestep, output_dir=".", prefix="fields"):
-    """
-    Save VTK fields to the specified directory.
-
-    Parameters
-    ----------
-    timestep (int): The timestep number to be associated with the saved fields.
-    fields (Dict[str, np.ndarray]): A dictionary of fields to be saved. Each field must be an array-like object
-        with dimensions (nx, ny) for 2D fields or (nx, ny, nz) for 3D fields, where:
-            - nx : int, number of grid points along the x-axis
-            - ny : int, number of grid points along the y-axis
-            - nz : int, number of grid points along the z-axis (for 3D fields only)
-        The key value for each field in the dictionary must be a string containing the name of the field.
-    output_dir (str, optional, default: '.'): The directory in which to save the VTK files. Defaults to the current directory.
-    prefix (str, optional, default: 'fields'): A prefix to be added to the filename. Defaults to 'fields'.
-
-    Returns
-    -------
-    None
-
-    Notes
-    -----
-    This function saves the VTK fields in the specified directory, with filenames based on the provided timestep number
-    and the filename. For example, if the timestep number is 10 and the file name is fields, the VTK file
-    will be saved as 'fields_0000010.vtk'in the specified directory.
-
-    """
-    # Assert that all fields have the same dimensions
-    for key, value in fields.items():
-        if key == list(fields.keys())[0]:
-            dimensions = value.shape
-        else:
-            assert value.shape == dimensions, "All fields must have the same dimensions!"
-
-    output_filename = os.path.join(output_dir, prefix + "_" + f"{timestep:07d}.vtk")
-
-    # Add 1 to the dimensions tuple as we store cell values
-    dimensions = tuple([dim + 1 for dim in dimensions])
-
-    # Create a uniform grid
-    if value.ndim == 2:
-        dimensions = dimensions + (1,)
-
-    grid = pv.ImageData(dimensions=dimensions)
-
-    # Add the fields to the grid
-    for key, value in fields.items():
-        grid[key] = value.flatten(order="F")
-
-    # Save the grid to a VTK file
-    start = time()
-    grid.save(output_filename, binary=True)
-    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
-
-
-def save_BCs_vtk(timestep, BCs, gridInfo, output_dir="."):
-    """
-    Save boundary conditions as VTK format to the specified directory.
-
-    Parameters
-    ----------
-    timestep (int): The timestep number to be associated with the saved fields.
-    BCs (List[BC]): A list of boundary conditions to be saved. Each boundary condition must be an object of type BC.
-
-    Returns
-    -------
-    None
-
-    Notes
-    -----
-    This function saves the boundary conditions in the specified directory, with filenames based on the provided timestep number
-    and the filename. For example, if the timestep number is 10, the VTK file
-    will be saved as 'BCs_0000010.vtk'in the specified directory.
-    """
-
-    # Create a uniform grid
-    if gridInfo["nz"] == 0:
-        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, 1)
-        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], 1)
-    else:
-        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, gridInfo["nz"] + 1)
-        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], gridInfo["nz"])
-
-    grid = pv.ImageData(dimensions=gridDimensions)
-
-    # Dictionary to keep track of encountered BC names
-    bcNamesCount = {}
-
-    for bc in BCs:
-        bcName = bc.name
-        if bcName in bcNamesCount:
-            bcNamesCount[bcName] += 1
-        else:
-            bcNamesCount[bcName] = 0
-        bcName += f"_{bcNamesCount[bcName]}"
-
-        if bc.isDynamic:
-            bcIndices, _ = bc.update_function(timestep)
-        else:
-            bcIndices = bc.indices
-
-        # Convert indices to 1D indices
-        if gridInfo["dim"] == 2:
-            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions[:-1], order="F")
-        else:
-            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions, order="F")
-
-        grid[bcName] = np.zeros(fieldDimensions, dtype=bool).flatten(order="F")
-        grid[bcName][bcIndices] = True
-
-    # Save the grid to a VTK file
-    output_filename = os.path.join(output_dir, "BCs_" + f"{timestep:07d}.vtk")
-
-    start = time()
-    grid.save(output_filename, binary=True)
-    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
-
-
-def rotate_geometry(indices, origin, axis, angle):
-    """
-    Rotates a voxelized mesh around a given axis.
-
-    Parameters
-    ----------
-    indices : array-like
-        The indices of the voxels in the mesh.
-    origin : array-like
-        The coordinates of the origin of the rotation axis.
-    axis : array-like
-        The direction vector of the rotation axis. This should be a 3-element sequence.
-    angle : float
-        The angle by which to rotate the mesh, in radians.
-
-    Returns
-    -------
-    tuple
-        The indices of the voxels in the rotated mesh.
-
-    Notes
-    -----
-    This function rotates the mesh by applying a rotation matrix to the voxel indices. The rotation matrix is calculated
-    using the axis-angle representation of rotations. The origin of the rotation axis is assumed to be at (0, 0, 0).
-    """
-    indices_rotated = (jnp.array(indices).T - origin) @ axangle2mat(axis, angle) + origin
-    return tuple(jnp.rint(indices_rotated).astype("int32").T)
-
-
-def voxelize_stl(stl_filename, length_lbm_unit=None, tranformation_matrix=None, pitch=None):
-    """
-    Converts an STL file to a voxelized mesh.
-
-    Parameters
-    ----------
-    stl_filename : str
-        The name of the STL file to be voxelized.
-    length_lbm_unit : float, optional
-        The unit length in LBM. Either this or 'pitch' must be provided.
-    tranformation_matrix : array-like, optional
-        A transformation matrix to be applied to the mesh before voxelization.
-    pitch : float, optional
-        The pitch of the voxel grid. Either this or 'length_lbm_unit' must be provided.
-
-    Returns
-    -------
-    trimesh.VoxelGrid, float
-        The voxelized mesh and the pitch of the voxel grid.
-
-    Notes
-    -----
-    This function uses the trimesh library to load the STL file and voxelized the mesh. If a transformation matrix is
-    provided, it is applied to the mesh before voxelization. The pitch of the voxel grid is calculated based on the
-    maximum extent of the mesh and the provided lattice Boltzmann unit length, unless a pitch is provided directly.
-    """
-    if length_lbm_unit is None and pitch is None:
-        raise ValueError("Either 'length_lbm_unit' or 'pitch' must be provided!")
-    mesh = trimesh.load_mesh(stl_filename, process=False)
-    length_phys_unit = mesh.extents.max()
-    if tranformation_matrix is not None:
-        mesh.apply_transform(tranformation_matrix)
-    if pitch is None:
-        pitch = length_phys_unit / length_lbm_unit
-    mesh_voxelized = mesh.voxelized(pitch=pitch)
-    return mesh_voxelized, pitch
-
-
-def axangle2mat(axis, angle, is_normalized=False):
-    """Rotation matrix for rotation angle `angle` around `axis`
-    Parameters
-    ----------
-    axis : 3 element sequence
-       vector specifying axis for rotation.
-    angle : scalar
-       angle of rotation in radians.
-    is_normalized : bool, optional
-       True if `axis` is already normalized (has norm of 1).  Default False.
-    Returns
-    -------
-    mat : array shape (3,3)
-       rotation matrix for specified rotation
-    Notes
-    -----
-    From : https://github.com/matthew-brett/transforms3d
-    Ref : http://en.wikipedia.org/wiki/Rotation_matrix#Axis_and_angle
-    """
-    x, y, z = axis
-    if not is_normalized:
-        n = jnp.sqrt(x * x + y * y + z * z)
-        x = x / n
-        y = y / n
-        z = z / n
-    c = jnp.cos(angle)
-    s = jnp.sin(angle)
-    C = 1 - c
-    xs = x * s
-    ys = y * s
-    zs = z * s
-    xC = x * C
-    yC = y * C
-    zC = z * C
-    xyC = x * yC
-    yzC = y * zC
-    zxC = z * xC
-    return jnp.array([
-        [x * xC + c, xyC - zs, zxC + ys],
-        [xyC + zs, y * yC + c, yzC - xs],
-        [zxC - ys, yzC + xs, z * zC + c],
-    ])
+import numpy as np
+import matplotlib.pylab as plt
+from matplotlib import cm
+from time import time
+import pyvista as pv
+from scipy.interpolate import RegularGridInterpolator
+from scipy.ndimage import map_coordinates
+from jax.image import resize
+from jax import jit
+import jax.numpy as jnp
+from functools import partial
+import trimesh
+import vtk
+import open3d as o3d
+import h5py
+
+import os
+import __main__
+
+
+@partial(jit, static_argnums=(1, 2))
+def downsample_field(field, factor, method="bicubic"):
+    """
+    Downsample a JAX array by a factor of `factor` along each axis.
+
+    Parameters
+    ----------
+    field : jax.numpy.ndarray
+        The input vector field to be downsampled. This should be a 3D or 4D JAX array where the last dimension is 2 or 3 (vector components).
+    factor : int
+        The factor by which to downsample the field. The dimensions of the field will be divided by this factor.
+    method : str, optional
+        The method to use for downsampling. Default is 'bicubic'.
+
+    Returns
+    -------
+    jax.numpy.ndarray
+        The downsampled field.
+    """
+    if factor == 1:
+        return field
+    else:
+        new_shape = tuple(dim // factor for dim in field.shape[:-1])
+        downsampled_components = []
+        for i in range(field.shape[-1]):  # Iterate over the last dimension (vector components)
+            resized = resize(field[..., i], new_shape, method=method)
+            downsampled_components.append(resized)
+
+        return jnp.stack(downsampled_components, axis=-1)
+
+
+def save_image(fld, timestep=None, prefix=None, **kwargs):
+    """
+    Save an image of a field at a given timestep.
+
+    Parameters
+    ----------
+    timestep : int, optional
+        The timestep at which the field is being saved. When given, it is zero-padded to 4 digits and appended to the filename.
+    fld : jax.numpy.ndarray
+        The field to be saved: a 2D scalar array, or a 3D array whose first axis indexes the vector components (magnitude is saved).
+    prefix : str, optional
+        A prefix to be added to the filename. The filename will be the name of the main script file by default.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    This function saves the field as an image in the PNG format.
+    The filename is based on the name of the main script file, the provided prefix, and the timestep number.
+    If the field is 3D, the Euclidean norm over all components along the first axis is calculated and saved.
+    The image is saved with the 'nipy_spectral' colormap and the origin set to 'lower'; a 'cmap' keyword argument is ignored.
+    """
+    if prefix is None:
+        fname = os.path.basename(__main__.__file__)
+        fname = os.path.splitext(fname)[0]
+    else:
+        fname = prefix
+
+    if timestep is not None:
+        fname = fname + "_" + str(timestep).zfill(4)
+
+    if len(fld.shape) > 3:
+        raise ValueError("The input field should be 2D!")
+    if len(fld.shape) == 3:
+        fld = np.linalg.norm(fld, axis=0)  # magnitude over every component, not component 0 squared twice
+
+    plt.clf()
+    kwargs.pop("cmap", None)
+    plt.imsave(fname + ".png", fld.T, cmap=cm.nipy_spectral, origin="lower", **kwargs)
+
+
+def save_fields_vtk(fields, timestep, output_dir=".", prefix="fields", shift_coords=(0, 0, 0), scale=1):
+    """
+    Save VTK fields to the specified directory, shifting the coordinates if needed.
+
+    Parameters
+    ----------
+    timestep (int): The timestep number to be associated with the saved fields.
+    fields (Dict[str, np.ndarray]): A dictionary of fields to be saved. Each field must be an array-like object
+        with dimensions (nx, ny) for 2D fields or (nx, ny, nz) for 3D fields, where:
+            - nx : int, number of grid points along the x-axis
+            - ny : int, number of grid points along the y-axis
+            - nz : int, number of grid points along the z-axis (for 3D fields only)
+        The key value for each field in the dictionary must be a string containing the name of the field.
+    output_dir (str, optional, default: '.'): The directory in which to save the VTK files. Defaults to the current directory.
+    prefix (str, optional, default: 'fields'): A prefix to be added to the filename. Defaults to 'fields'.
+    shift_coords (tuple, optional, default: (0, 0, 0)): The origin of the grid, i.e. the shift applied in the x, y, and z directions.
+    scale (int or float, optional, default: 1): The uniform grid spacing applied along the x, y, and z axes.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    This function saves the VTK fields in the specified directory, with filenames based on the provided timestep number
+    and the filename. For example, if the timestep number is 10 and the file name is fields, the VTK file
+    will be saved as 'fields_00000010.vtk' in the specified directory.
+
+    """
+    start = time()
+    # Assert that all fields have the same dimensions
+    for key, value in fields.items():
+        if key == list(fields.keys())[0]:
+            dimensions = value.shape
+        else:
+            assert value.shape == dimensions, "All fields must have the same dimensions!"
+
+    output_filename = os.path.join(output_dir, prefix + "_" + f"{timestep:08d}.vtk")
+
+    # Add 1 to the dimensions tuple as we store cell values
+    dimensions = tuple([dim + 1 for dim in dimensions])
+
+    # Create a uniform grid
+    if value.ndim == 2:
+        dimensions = dimensions + (1,)
+
+    grid = pv.ImageData(dimensions=dimensions, origin=shift_coords, spacing=(scale, scale, scale))
+
+    # Add the fields to the grid
+    for key, value in fields.items():
+        grid[key] = value.flatten(order="F")
+
+    # Save the grid to a VTK file
+    grid.save(output_filename, binary=True)
+    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
+
+
+def map_field_vtk_interpolator(
+    field, stl_filename, voxel_size, output_dir=".", prefix="mapped_field", origin=[0, 0, 0], method="cubic", normals=True
+):
+    """
+    Map a volumetric field onto an STL mesh using RegularGridInterpolator.
+
+    Parameters
+    ----------
+    field : np.ndarray
+        3D array representing the volumetric field.
+    stl_filename : str
+        Path to the STL file.
+    voxel_size : float
+        Size of a voxel along each axis.
+    output_dir : str, optional
+        Directory to save the output VTK file.
+    prefix : str, optional
+        Filename prefix.
+    origin : list or tuple of float, optional
+        Origin of the grid.
+    method : str, optional
+        Interpolation method (e.g., 'cubic').
+    normals : bool, optional
+        If True, use normal-direction averaging by sampling points offset along the surface normal;
+        if False, simply sample the field at the surface points.
+
+    Returns
+    -------
+    None
+    """
+
+    print("Mapping field to stl with {} method".format("normal averaging" if normals else "original sampling"))
+    start = time()
+    grid_shape = field.shape
+
+    # Create coordinate arrays based on the origin and voxel size.
+    x = origin[0] + np.arange(grid_shape[0]) * voxel_size
+    y = origin[1] + np.arange(grid_shape[1]) * voxel_size
+    z = origin[2] + np.arange(grid_shape[2]) * voxel_size
+
+    # Set up the interpolation function.
+    interp_func = RegularGridInterpolator((x, y, z), field, method=method, bounds_error=False, fill_value=None)
+
+    # Load the STL mesh.
+    stl_mesh = pv.read(stl_filename)
+
+    if normals:
+        # Compute normals if not already available.
+        if "Normals" not in stl_mesh.point_data:
+            stl_mesh = stl_mesh.compute_normals()
+        normals_arr = stl_mesh.point_normals  # shape (N, 3)
+        points = stl_mesh.points  # shape (N, 3)
+
+        # Define offsets along the normal: sample 2 voxels away on each side of the surface (the surface itself is not sampled).
+        offsets = np.array([-2, 2]) * voxel_size  # shape (2,)
+        # offsets = np.array([-2, -1, 0, 1, 2]) * voxel_size  # shape (5,)
+        # Generate sample points along the normal for each mesh point.
+        sample_points = points[:, np.newaxis, :] + offsets[np.newaxis, :, np.newaxis] * normals_arr[:, np.newaxis, :]
+        sample_points_reshaped = sample_points.reshape(-1, 3)
+
+        # Interpolate the field at each of the sample points.
+        field_values = interp_func(sample_points_reshaped)
+        field_values = field_values.reshape(points.shape[0], len(offsets))
+        # Average the values along the normal offset direction.
+        field_mapped = np.mean(field_values, axis=1)
+    else:
+        # Original: simply sample the field at the surface points.
+        points = stl_mesh.points
+        field_mapped = interp_func(points)
+
+    # Assign the mapped field to the mesh and save.
+    stl_mesh["field"] = field_mapped
+    output_filename = os.path.join(output_dir, prefix + ".vtk")
+    stl_mesh.save(output_filename)
+    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
+
+
+def map_field_vtk(field, stl_filename, output_dir=".", prefix="mapped_field", shift_coords=(0, 0, 0), scale=1, normals=True):
+    """
+    Save VTK fields to the specified directory by probing a uniform grid
+    generated from a field array onto an STL mesh. If normals is True, for
+    each STL point the field is averaged over points offset along the surface normal.
+
+    Parameters
+    ----------
+    field : np.ndarray
+        The field data (2D or 3D) to be mapped.
+    stl_filename : str
+        Path to the STL file.
+    output_dir : str, optional
+        Directory to save the output VTK file.
+    prefix : str, optional
+        Filename prefix.
+    shift_coords : tuple, optional
+        Origin (shift) for the uniform grid.
+    scale : int or float, optional
+        Spacing of the uniform grid.
+    normals : bool, optional
+        If True, average field values along the surface normal (sampling 2 voxels on either side);
+        if False, use the original probe method.
+
+    Returns
+    -------
+    None
+    """
+    start = time()
+    method_str = "normal averaging" if normals else "original sampling"
+    print(f"Mapping field to stl with {method_str}")
+    output_filename = os.path.join(output_dir, prefix + ".vtk")
+
+    # Create the uniform grid dimensions (note: cell values require dimensions + 1).
+    dimensions = tuple(dim + 1 for dim in field.shape)
+    if field.ndim == 2:
+        dimensions = dimensions + (1,)
+
+    # Create a uniform grid (ImageData) with the specified origin and spacing.
+    grid = pv.ImageData(dimensions=dimensions, origin=shift_coords, spacing=(scale, scale, scale))
+    grid.cell_data["field"] = field.flatten(order="F")
+    grid = grid.cell_data_to_point_data()
+
+    # Load the STL mesh.
+    stl_mesh = pv.read(stl_filename)
+
+    if normals:
+        # Compute normals if not available.
+        if "Normals" not in stl_mesh.point_data:
+            stl_mesh = stl_mesh.compute_normals()
+        normals_arr = stl_mesh.point_normals  # shape (N, 3)
+        points = stl_mesh.points  # shape (N, 3)
+
+        # Define offsets along the normal: sample 2 voxels in both directions.
+        offsets = np.array([-2, 2]) * scale
+        # offsets = np.array([-2, -1, 0, 1, 2]) * scale
+        # Generate sample points along the normal for each STL point.
+        sample_points = points[:, np.newaxis, :] + offsets[np.newaxis, :, np.newaxis] * normals_arr[:, np.newaxis, :]
+        sample_points_reshaped = sample_points.reshape(-1, 3)
+
+        # Create a PolyData object from these sample points.
+        samples_pd = pv.PolyData(sample_points_reshaped)
+
+        # Use vtkProbeFilter to sample the grid at these sample locations.
+        probe = vtk.vtkProbeFilter()
+        probe.SetInputData(samples_pd)
+        probe.SetSourceData(grid)
+        probe.Update()
+        sampled = pv.wrap(probe.GetOutput())
+        sample_field = sampled.point_data["field"]
+        averaged_field = np.mean(sample_field.reshape(-1, len(offsets)), axis=1)
+
+        # Assign the averaged field to the mesh.
+        stl_mesh["field"] = averaged_field
+        stl_mesh.save(output_filename)
+    else:
+        # Original method: use vtkProbeFilter on the STL geometry.
+        stl_vtk = stl_mesh.extract_geometry()
+        probe = vtk.vtkProbeFilter()
+        probe.SetInputData(stl_vtk)
+        probe.SetSourceData(grid)
+        probe.Update()
+        stl_mapped = pv.wrap(probe.GetOutput())
+        stl_mapped.save(output_filename)
+
+    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
+
+
+def save_BCs_vtk(timestep, BCs, gridInfo, output_dir="."):
+    """
+    Save boundary conditions as VTK format to the specified directory.
+
+    Parameters
+    ----------
+    timestep (int): The timestep number to be associated with the saved fields.
+    BCs (List[BC]): A list of boundary conditions to be saved. Each boundary condition must be an object of type BC.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    This function saves the boundary conditions in the specified directory, with filenames based on the provided timestep number
+    and the filename. For example, if the timestep number is 10, the VTK file
+    will be saved as 'BCs_0000010.vtk' in the specified directory.
+    """
+
+    # Create a uniform grid
+    if gridInfo["nz"] == 0:
+        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, 1)
+        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], 1)
+    else:
+        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, gridInfo["nz"] + 1)
+        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], gridInfo["nz"])
+
+    grid = pv.ImageData(dimensions=gridDimensions)
+
+    # Dictionary to keep track of encountered BC names
+    bcNamesCount = {}
+
+    for bc in BCs:
+        bcName = bc.name
+        if bcName in bcNamesCount:
+            bcNamesCount[bcName] += 1
+        else:
+            bcNamesCount[bcName] = 0
+        bcName += f"_{bcNamesCount[bcName]}"
+
+        if bc.isDynamic:
+            bcIndices, _ = bc.update_function(timestep)
+        else:
+            bcIndices = bc.indices
+
+        # Convert indices to 1D indices
+        if gridInfo["dim"] == 2:
+            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions[:-1], order="F")
+        else:
+            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions, order="F")
+
+        grid[bcName] = np.zeros(fieldDimensions, dtype=bool).flatten(order="F")
+        grid[bcName][bcIndices] = True
+
+    # Save the grid to a VTK file
+    output_filename = os.path.join(output_dir, "BCs_" + f"{timestep:07d}.vtk")
+
+    start = time()
+    grid.save(output_filename, binary=True)
+    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
+
+
+def rotate_geometry(indices, origin, axis, angle):
+    """
+    Rotates a voxelized mesh around a given axis.
+
+    Parameters
+    ----------
+    indices : array-like
+        The indices of the voxels in the mesh.
+    origin : array-like
+        The coordinates of the origin of the rotation axis.
+    axis : array-like
+        The direction vector of the rotation axis. This should be a 3-element sequence.
+    angle : float
+        The angle by which to rotate the mesh, in radians.
+
+    Returns
+    -------
+    tuple
+        The indices of the voxels in the rotated mesh.
+
+    Notes
+    -----
+    This function rotates the mesh by applying a rotation matrix to the voxel indices. The rotation matrix is calculated
+    using the axis-angle representation of rotations. The rotation is applied about the point `origin`: it is subtracted before rotating and added back afterwards.
+    """
+    indices_rotated = (jnp.array(indices).T - origin) @ axangle2mat(axis, angle) + origin
+    return tuple(jnp.rint(indices_rotated).astype("int32").T)
+
+
+def voxelize_stl(stl_filename, length_lbm_unit=None, tranformation_matrix=None, pitch=None):
+    """
+    Converts an STL file to a voxelized mesh.
+
+    Parameters
+    ----------
+    stl_filename : str
+        The name of the STL file to be voxelized.
+    length_lbm_unit : float, optional
+        The unit length in LBM. Either this or 'pitch' must be provided.
+    tranformation_matrix : array-like, optional
+        A transformation matrix to be applied to the mesh before voxelization.
+    pitch : float, optional
+        The pitch of the voxel grid. Either this or 'length_lbm_unit' must be provided.
+
+    Returns
+    -------
+    trimesh.VoxelGrid, float
+        The voxelized mesh and the pitch of the voxel grid.
+
+    Notes
+    -----
+    This function uses the trimesh library to load the STL file and voxelize the mesh. If a transformation matrix is
+    provided, it is applied to the mesh before voxelization. The pitch of the voxel grid is calculated based on the
+    maximum extent of the mesh and the provided lattice Boltzmann unit length, unless a pitch is provided directly.
+    """
+    if length_lbm_unit is None and pitch is None:
+        raise ValueError("Either 'length_lbm_unit' or 'pitch' must be provided!")
+    mesh = trimesh.load_mesh(stl_filename, process=False)
+    length_phys_unit = mesh.extents.max()
+    if tranformation_matrix is not None:
+        mesh.apply_transform(tranformation_matrix)
+    if pitch is None:
+        pitch = length_phys_unit / length_lbm_unit
+    mesh_voxelized = mesh.voxelized(pitch=pitch)
+    return mesh_voxelized, pitch
+
+
+def save_fields_hdf5(fields, timestep, output_dir=".", prefix="fields", shift_coords=(0, 0, 0), scale=1, compression="gzip", compression_opts=0):
+    start = time()
+    filename = str(prefix + "_" + f"{timestep:08d}.h5")
+    output_filename = os.path.join(output_dir, filename)
+
+    # Determine the dimensions (assuming all fields have the same shape)
+    for key, value in fields.items():
+        if key == list(fields.keys())[0]:
+            dimensions = value.shape
+        else:
+            assert value.shape == dimensions, "All fields must have the same dimensions!"
+
+    with h5py.File(output_filename, "w") as f:
+        # Write field data with Fortran order to match the VTK convention
+        for key, value in fields.items():
+            value = np.transpose(value, (2, 1, 0))
+
+            dataset = f.create_dataset(key, data=value, dtype="float32", compression=compression, compression_opts=compression_opts)
+            dataset.attrs["origin"] = shift_coords
+            dataset.attrs["spacing"] = (scale, scale, scale)
+
+    # Write the XDMF file using HyperSlab to properly reference the HDF5 data
+    xdmf_filename = os.path.join(output_dir, prefix + "_" + f"{timestep:08d}.xdmf")
+    with open(xdmf_filename, "w") as xdmf:
+        xdmf.write(f"""<?xml version="1.0" ?>
+<Xdmf Version="3.0" xmlns:xi="http://www.w3.org/2001/XInclude">
+  <Domain>
+    <Grid Name="fields" GridType="Uniform">
+      <Topology TopologyType="3DCoRectMesh" Dimensions="{dimensions[2] + 1} {dimensions[1] + 1} {dimensions[0] + 1}"/>
+      <Geometry GeometryType="ORIGIN_DXDYDZ">
+        <DataItem Dimensions="3" NumberType="Float" Precision="4" Format="XML">
+          {shift_coords[2]} {shift_coords[1]} {shift_coords[0]}
+        </DataItem>
+        <DataItem Dimensions="3" NumberType="Float" Precision="4" Format="XML">
+          {scale} {scale} {scale}
+        </DataItem>
+      </Geometry>
+""")
+        for key in fields.keys():
+            xdmf.write(f"""
+      <Attribute Name="{key}" AttributeType="Scalar" Center="Cell">
+        <DataItem ItemType="HyperSlab" Dimensions="{dimensions[2]} {dimensions[1]} {dimensions[0]}" NumberType="Float" Precision="4" Format="HDF">
+          <DataItem Dimensions="3 3" Format="XML">
+            0 0 0
+            1 1 1
+            {dimensions[2]} {dimensions[1]} {dimensions[0]}
+          </DataItem>
+          <DataItem Dimensions="{dimensions[2]} {dimensions[1]} {dimensions[0]}" NumberType="Float" Precision="4" Format="HDF">
+            {filename}:/{key}
+          </DataItem>
+        </DataItem>
+      </Attribute>
+""")
+        xdmf.write("""
+    </Grid>
+  </Domain>
+</Xdmf>
+""")
+
+    print(f"Saved {output_filename} and {xdmf_filename} in {time() - start:.6f} seconds.")
+
+
+def axangle2mat(axis, angle, is_normalized=False):
+    """Rotation matrix for a rotation by `angle` around `axis`.
+    Parameters
+    ----------
+    axis : 3 element sequence
+       vector specifying axis for rotation.
+    angle : scalar
+       angle of rotation in radians.
+    is_normalized : bool, optional
+       True if `axis` is already normalized (has norm of 1).  Default False.
+    Returns
+    -------
+    mat : jnp array shape (3,3)
+       rotation matrix for specified rotation
+    Notes
+    -----
+    From : https://github.com/matthew-brett/transforms3d
+    Ref : http://en.wikipedia.org/wiki/Rotation_matrix#Axis_and_angle
+    """
+    x, y, z = axis
+    if not is_normalized:
+        n = jnp.sqrt(x * x + y * y + z * z)  # Euclidean norm of the axis; a zero-length axis is not guarded against
+        x = x / n
+        y = y / n
+        z = z / n
+    c = jnp.cos(angle)
+    s = jnp.sin(angle)
+    C = 1 - c  # (1 - cos(angle)) factor that multiplies every axis-product term below
+    xs = x * s
+    ys = y * s
+    zs = z * s
+    xC = x * C
+    yC = y * C
+    zC = z * C
+    xyC = x * yC
+    yzC = y * zC
+    zxC = z * xC
+    return jnp.array([
+        [x * xC + c, xyC - zs, zxC + ys],
+        [xyC + zs, y * yC + c, yzC - xs],
+        [zxC - ys, yzC + xs, z * zC + c],
+    ])
+
+
+def voxelize_stl_open3d(stl_filename, length_lbm_unit):
+    """Voxelize the STL mesh at voxel size `length_lbm_unit`; return (boolean occupancy array, bounding-box corner points)."""
+    mesh = o3d.io.read_triangle_mesh(stl_filename)
+    print("..Model read")
+    # Compute the voxel grid from the mesh
+    voxel_grid = o3d.geometry.VoxelGrid.create_from_triangle_mesh(mesh, voxel_size=length_lbm_unit)
+    print("...Grid created")
+    # Get the bounding box of the voxel grid
+    bbox = voxel_grid.get_axis_aligned_bounding_box()
+
+    # Calculate the number of voxels along each axis
+    grid_size = np.ceil((bbox.get_max_bound() - bbox.get_min_bound()) / length_lbm_unit).astype(int)
+
+    # Initialize an empty 3D array based on the calculated grid size
+    voxel_matrix = np.zeros(grid_size, dtype=bool)
+
+    # Convert voxel indices to a boolean matrix
+    for voxel in voxel_grid.get_voxels():
+        x, y, z = voxel.grid_index
+        voxel_matrix[x, y, z] = True  # NOTE(review): assumes every grid_index lies inside grid_size - confirm against Open3D
+
+    # Return the voxel matrix and the bounding box corners
+    return voxel_matrix, bbox.get_box_points()
+
+
+@partial(jit)
+def q_criterion(u, omega=2.0):
+    """Return (vorticity magnitude, Q-criterion, tau_xy, tau_xz, tau_yz, shear-stress magnitude) of `u`, zero-padded by one cell per side."""
+    u_x = u[0, ...]
+    u_y = u[1, ...]
+    u_z = u[2, ...]
+
+    # Central differences evaluated on interior points only (grid spacing taken as 1)
+    u_x_dx = (u_x[2:, 1:-1, 1:-1] - u_x[:-2, 1:-1, 1:-1]) / 2
+    u_x_dy = (u_x[1:-1, 2:, 1:-1] - u_x[1:-1, :-2, 1:-1]) / 2
+    u_x_dz = (u_x[1:-1, 1:-1, 2:] - u_x[1:-1, 1:-1, :-2]) / 2
+    u_y_dx = (u_y[2:, 1:-1, 1:-1] - u_y[:-2, 1:-1, 1:-1]) / 2
+    u_y_dy = (u_y[1:-1, 2:, 1:-1] - u_y[1:-1, :-2, 1:-1]) / 2
+    u_y_dz = (u_y[1:-1, 1:-1, 2:] - u_y[1:-1, 1:-1, :-2]) / 2
+    u_z_dx = (u_z[2:, 1:-1, 1:-1] - u_z[:-2, 1:-1, 1:-1]) / 2
+    u_z_dy = (u_z[1:-1, 2:, 1:-1] - u_z[1:-1, :-2, 1:-1]) / 2
+    u_z_dz = (u_z[1:-1, 1:-1, 2:] - u_z[1:-1, 1:-1, :-2]) / 2
+
+    # Compute vorticity (curl of u) and its magnitude; `mu_*` here is unrelated to the viscosity `mu` below
+    mu_x = u_z_dy - u_y_dz
+    mu_y = u_x_dz - u_z_dx
+    mu_z = u_y_dx - u_x_dy
+    norm_mu = jnp.sqrt(mu_x**2 + mu_y**2 + mu_z**2)
+
+    # Compute the symmetric strain-rate tensor and its double contraction with itself
+    s_0_0 = u_x_dx
+    s_0_1 = 0.5 * (u_x_dy + u_y_dx)
+    s_0_2 = 0.5 * (u_x_dz + u_z_dx)
+    s_1_0 = s_0_1
+    s_1_1 = u_y_dy
+    s_1_2 = 0.5 * (u_y_dz + u_z_dy)
+    s_2_0 = s_0_2
+    s_2_1 = s_1_2
+    s_2_2 = u_z_dz
+    s_dot_s = s_0_0**2 + s_0_1**2 + s_0_2**2 + s_1_0**2 + s_1_1**2 + s_1_2**2 + s_2_0**2 + s_2_1**2 + s_2_2**2
+
+    # Compute viscosity from the `omega` argument: mu = (1 / omega - 1 / 2) / 3
+    mu = ((1 / omega) - 0.5) / 3.0
+
+    # Compute shear stress components
+    tau_xy = 2 * mu * s_0_1
+    tau_xz = 2 * mu * s_0_2
+    tau_yz = 2 * mu * s_1_2
+
+    # Compute shear stress magnitude
+    tau_magnitude = jnp.sqrt(tau_xy**2 + tau_xz**2 + tau_yz**2)
+
+    # Compute the antisymmetric (rotation-rate) part of the velocity gradient; distinct from the `omega` argument
+    omega_0_0 = 0.0
+    omega_0_1 = 0.5 * (u_x_dy - u_y_dx)
+    omega_0_2 = 0.5 * (u_x_dz - u_z_dx)
+    omega_1_0 = -omega_0_1
+    omega_1_1 = 0.0
+    omega_1_2 = 0.5 * (u_y_dz - u_z_dy)
+    omega_2_0 = -omega_0_2
+    omega_2_1 = -omega_1_2
+    omega_2_2 = 0.0
+    omega_dot_omega = (
+        omega_0_0**2 + omega_0_1**2 + omega_0_2**2 + omega_1_0**2 + omega_1_1**2 + omega_1_2**2 + omega_2_0**2 + omega_2_1**2 + omega_2_2**2
+    )
+
+    # Compute q-criterion: half the difference between rotation-rate and strain-rate contractions
+    q = 0.5 * (omega_dot_omega - s_dot_s)
+
+    # Pad outputs to match original shape
+    pad_width = ((1, 1), (1, 1), (1, 1))  # Add 1 voxel on each side in x, y, z
+    norm_mu = jnp.pad(norm_mu, pad_width, mode="constant", constant_values=0)
+    q = jnp.pad(q, pad_width, mode="constant", constant_values=0)
+    tau_xy = jnp.pad(tau_xy, pad_width, mode="constant", constant_values=0)
+    tau_xz = jnp.pad(tau_xz, pad_width, mode="constant", constant_values=0)
+    tau_yz = jnp.pad(tau_yz, pad_width, mode="constant", constant_values=0)
+    tau_magnitude = jnp.pad(tau_magnitude, pad_width, mode="constant", constant_values=0)
+
+    return norm_mu, q, tau_xy, tau_xz, tau_yz, tau_magnitude

From 91b8d0a8905c4f4c4b6ef4abba433551d1330071 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 12 Jun 2025 14:23:09 -0400
Subject: [PATCH 071/208] Added hybridBC to the Neon backend and MultiRes
 settings

---
 .../stl_flow_past_sphere_3d.py                | 17 ++++-
 .../bc_halfway_bounce_back.py                 |  6 +-
 xlb/operator/boundary_condition/bc_hybrid.py  | 75 +++++++++++++------
 .../boundary_condition/helper_functions_bc.py |  7 +-
 .../multires_boundary_masker.py               | 61 ++++++++++++---
 xlb/operator/stepper/nse_multires_stepper.py  | 16 ++--
 6 files changed, 132 insertions(+), 50 deletions(-)

diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index f71bbc07..33398a6c 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -3,12 +3,21 @@
 from xlb.precision_policy import PrecisionPolicy
 from xlb.grid import multires_grid_factory
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
-from xlb.operator.boundary_condition import FullwayBounceBackBC, HalfwayBounceBackBC, RegularizedBC, ExtrapolationOutflowBC, DoNothingBC, ZouHeBC
+from xlb.operator.boundary_condition import (
+    FullwayBounceBackBC,
+    HalfwayBounceBackBC,
+    RegularizedBC,
+    ExtrapolationOutflowBC,
+    DoNothingBC,
+    ZouHeBC,
+    HybridBC,
+)
 from xlb.utils import make_cuboid_mesh
 import neon
 import warp as wp
 import numpy as np
 import time
+from xlb.operator.boundary_masker import MeshVoxelizationMethod
 
 
 def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape):
@@ -164,7 +173,11 @@ def bc_profile_warp(index: wp.vec3i):
 bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
-bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere)
+# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
+bc_sphere = HybridBC(
+    bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=True
+)
+
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
 
 # Configure the simulation relaxation time
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 2410e1c8..145514ae 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -91,7 +91,7 @@ def __init__(
             prescribed_value = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)(prescribed_value)
 
             @wp.func
-            def prescribed_profile_warp(index: wp.vec3i, time: Any):
+            def prescribed_profile_warp(index: Any, time: Any):
                 return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
 
             self.profile = prescribed_profile_warp
@@ -109,8 +109,8 @@ def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         )
 
     def _construct_warp(self):
-        # load helper functions
-        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
+        # load helper functions. Explicitly using the WARP backend for helper functions as it may also be called by the Neon backend.
+        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=ComputeBackend.WARP)
 
         # Set local constants
         _opp_indices = self.velocity_set.opp_indices
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 68593f6c..17e323b1 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -60,10 +60,14 @@ def __init__(
             voxelization_method,
         )
 
+        # Only the WARP and NEON backends are supported; reject anything else (i.e. JAX) up front.
+        assert self.compute_backend in (ComputeBackend.WARP, ComputeBackend.NEON), "This BC is currently not supported by JAX backend!"
+
         # Instantiate the operator for computing macroscopic values
-        self.macroscopic = Macroscopic()
-        self.zero_moment = ZeroMoment()
-        self.equilibrium = QuadraticEquilibrium()
+        # Explicitly using the WARP backend for these operators as they may also be called by the Neon backend.
+        self.macroscopic = Macroscopic(compute_backend=ComputeBackend.WARP)
+        self.zero_moment = ZeroMoment(compute_backend=ComputeBackend.WARP)
+        self.equilibrium = QuadraticEquilibrium(compute_backend=ComputeBackend.WARP)
 
         # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
         # velocity profiles given by keyword "profile" which must be a callable function.
@@ -102,7 +106,7 @@ def __init__(
             prescribed_value = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)(prescribed_value)
 
             @wp.func
-            def prescribed_profile_warp(index: wp.vec3i, time: Any):
+            def prescribed_profile_warp(index: Any, time: Any):
                 return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
 
             self.profile = prescribed_profile_warp
@@ -126,6 +130,14 @@ def prescribed_profile_warp(index: wp.vec3i, time: Any):
         if self.velocity_set.d == 2:
             raise NotImplementedError("This BC is not implemented in 2D!")
 
+        # Define BC helper functions. Explicitly using the WARP backend for helper functions as it may also be called by the Neon backend.
+        self.bc_helper = HelperFunctionsBC(
+            velocity_set=self.velocity_set,
+            precision_policy=self.precision_policy,
+            compute_backend=ComputeBackend.WARP,
+            distance_decoder_function=self._construct_distance_decoder_function(),
+        )
+
         # if indices is not None:
         #     # this BC would be limited to stationary boundaries
         #     # assert mesh_vertices is None
@@ -135,8 +147,6 @@ def prescribed_profile_warp(index: wp.vec3i, time: Any):
         #     if mesh_velocity_function is not None:
         #         # mesh is moving and/or deforming
 
-        assert self.compute_backend == ComputeBackend.WARP, "This BC is currently only implemented with the Warp backend!"
-
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
     def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
@@ -144,19 +154,29 @@ def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         raise NotImplementedError(f"Operation {self.__class__.__name} not implemented in JAX!")
         return
 
-    def _construct_warp(self):
-        # load helper functions
-        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
-
-        # Set local variables and constants
-        _c = self.velocity_set.c
-        _q = self.velocity_set.q
-        _d = self.velocity_set.d
+    def _construct_distance_decoder_function(self):
+        """
+        Constructs the distance decoder function for this BC.
+        """
+        # Get the opposite indices for the velocity set
         _opp_indices = self.velocity_set.opp_indices
-        _f_vec = wp.vec(_q, dtype=self.compute_dtype)
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        _u_wall = _u_vec(0.0, 0.0, 0.0) if _d == 3 else _u_vec(0.0, 0.0)
 
+        # Define the distance decoder function for this BC
+        if self.compute_backend == ComputeBackend.WARP:
+
+            @wp.func
+            def distance_decoder_function(f_1: Any, index: Any, direction: Any):
+                return f_1[_opp_indices[direction], index[0], index[1], index[2]]
+
+        elif self.compute_backend == ComputeBackend.NEON:
+
+            @wp.func
+            def distance_decoder_function(f_1_pn: Any, index: Any, direction: Any):
+                return wp.neon_read(f_1_pn, index, _opp_indices[direction])
+
+        return distance_decoder_function
+
+    def _construct_warp(self):
         # Construct the functionals for this BC
         @wp.func
         def hybrid_bounceback_regularized(
@@ -177,7 +197,7 @@ def hybrid_bounceback_regularized(
 
             # Apply interpolated bounceback first to find missing populations at the boundary
             u_wall = self.profile(index, timestep)
-            f_post = bc_helper.interpolated_bounceback(
+            f_post = self.bc_helper.interpolated_bounceback(
                 index,
                 _missing_mask,
                 f_0,
@@ -194,7 +214,7 @@ def hybrid_bounceback_regularized(
 
             # Regularize the resulting populations
             feq = self.equilibrium.warp_functional(rho, u)
-            f_post = bc_helper.regularize_fpop(f_post, feq)
+            f_post = self.bc_helper.regularize_fpop(f_post, feq)
             return f_post
 
         @wp.func
@@ -216,7 +236,7 @@ def hybrid_bounceback_grads(
 
             # Apply interpolated bounceback first to find missing populations at the boundary
             u_wall = self.profile(index, timestep)
-            f_post = bc_helper.interpolated_bounceback(
+            f_post = self.bc_helper.interpolated_bounceback(
                 index,
                 _missing_mask,
                 f_0,
@@ -232,7 +252,7 @@ def hybrid_bounceback_grads(
             rho, u = self.macroscopic.warp_functional(f_post)
 
             # Compute Grad's appriximation using full equation as in Eq (10) of Dorschner et al.
-            f_post = bc_helper.grads_approximate_fpop(_missing_mask, rho, u, f_post)
+            f_post = self.bc_helper.grads_approximate_fpop(_missing_mask, rho, u, f_post)
             return f_post
 
         @wp.func
@@ -254,7 +274,7 @@ def hybrid_nonequilibrium_regularized(
 
             # Apply interpolated bounceback first to find missing populations at the boundary
             u_wall = self.profile(index, timestep)
-            f_post = bc_helper.interpolated_nonequilibrium_bounceback(
+            f_post = self.bc_helper.interpolated_nonequilibrium_bounceback(
                 index,
                 _missing_mask,
                 f_0,
@@ -271,7 +291,7 @@ def hybrid_nonequilibrium_regularized(
 
             # Regularize the resulting populations
             feq = self.equilibrium.warp_functional(rho, u)
-            f_post = bc_helper.regularize_fpop(f_post, feq)
+            f_post = self.bc_helper.regularize_fpop(f_post, feq)
             return f_post
 
         if self.bc_method == "bounceback_regularized":
@@ -294,3 +314,12 @@ def warp_implementation(self, f_pre, f_post, bc_mask, _missing_mask):
             dim=f_pre.shape[1:],
         )
         return f_post
+
+    def _construct_neon(self):
+        functional, _ = self._construct_warp()  # reuse the Warp-built functional; the second returned item is discarded
+        return functional, None  # None takes the place of the discarded second item
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Raise an exception, as direct invocation of this BC is not implemented for the NEON backend yet
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index eb9a7c28..86656e91 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -7,7 +7,7 @@
 
 
 class HelperFunctionsBC(object):
-    def __init__(self, velocity_set=None, precision_policy=None, compute_backend=None):
+    def __init__(self, velocity_set=None, precision_policy=None, compute_backend=None, distance_decoder_function=None):
         if compute_backend == ComputeBackend.JAX:
             raise ValueError("This helper class contains helper functions only for the WARP implementation of some BCs not JAX!")
 
@@ -15,6 +15,7 @@ def __init__(self, velocity_set=None, precision_policy=None, compute_backend=Non
         self.velocity_set = velocity_set or DefaultConfig.velocity_set
         self.precision_policy = precision_policy or DefaultConfig.default_precision_policy
         self.compute_backend = compute_backend or DefaultConfig.default_backend
+        self.distance_decoder_function = distance_decoder_function
 
         # Set the compute and Store dtypes
         compute_dtype = self.precision_policy.compute_precision.wp_dtype
@@ -242,7 +243,7 @@ def interpolated_bounceback(
                     # The normalized distance to the mesh or "weights" have been stored in known directions of f_1
                     if needs_mesh_distance:
                         # use weights associated with curved boundaries that are properly stored in f_1.
-                        weight = compute_dtype(f_1[_opp_indices[l], index[0], index[1], index[2]])
+                        weight = compute_dtype(self.distance_decoder_function(f_1, index, l))
                     else:
                         weight = compute_dtype(0.5)
 
@@ -288,7 +289,7 @@ def interpolated_nonequilibrium_bounceback(
                     # The normalized distance to the mesh or "weights" have been stored in known directions of f_1
                     if needs_mesh_distance:
                         # use weights associated with curved boundaries that are properly stored in f_1.
-                        weight = compute_dtype(f_1[_opp_indices[l], index[0], index[1], index[2]])
+                        weight = compute_dtype(self.distance_decoder_function(f_1, index, l))
                     else:
                         weight = compute_dtype(0.5)
 
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
index 0de211ab..66bd5334 100644
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_boundary_masker.py
@@ -1,10 +1,17 @@
 import warp as wp
+import neon, typing, copy
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
 from xlb.grid import grid_factory
 from xlb.precision_policy import Precision
-from xlb.operator.boundary_masker import IndicesBoundaryMasker, MeshBoundaryMasker
-import neon, typing, copy
+from xlb.operator.boundary_masker import (
+    IndicesBoundaryMasker,
+    MeshVoxelizationMethod,
+    MeshMaskerAABB,
+    MeshMaskerRay,
+    MeshMaskerWinding,
+    MeshMaskerAABBFill,
+)
 
 
 class MultiresBoundaryMasker(Operator):
@@ -30,14 +37,9 @@ def __init__(
             precision_policy=precision_policy,
             compute_backend=ComputeBackend.WARP,
         )
-        self.mesh_masker = MeshBoundaryMasker(
-            velocity_set=velocity_set,
-            precision_policy=precision_policy,
-            compute_backend=ComputeBackend.WARP,
-        )
 
     @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, xlb_grid=None):
+    def neon_implementation(self, bclist, f_1, bc_mask, missing_mask, start_index=None, xlb_grid=None):
         # Ensure that this operator is called on multires grids
         assert bc_mask.get_grid().get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
 
@@ -53,6 +55,10 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
             grid_dense = grid_factory(grid_shape, compute_backend=ComputeBackend.WARP)
             missing_mask_warp = grid_dense.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
             bc_mask_warp = grid_dense.create_field(cardinality=1, dtype=Precision.UINT8)
+            f_1_warp = grid_dense.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+
+            # Set local constants
+            lattice_central_index = self.velocity_set.center_index
 
             # create a new bclist for this level only
             bc_with_indices = []
@@ -66,20 +72,49 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None, x
                     bc_copy.mesh_vertices = copy.deepcopy(bc.mesh_vertices) / refinement
 
                     # call mesh masker for this bc at this level
-                    bc_mask_warp, missing_mask_warp = self.mesh_masker(bc_copy, bc_mask_warp, missing_mask_warp)
+                    if bc.voxelization_method is MeshVoxelizationMethod.AABB:
+                        mesh_masker = MeshMaskerAABB(
+                            velocity_set=self.velocity_set,
+                            precision_policy=self.precision_policy,
+                            compute_backend=ComputeBackend.WARP,
+                        )
+                    elif bc.voxelization_method is MeshVoxelizationMethod.RAY:
+                        mesh_masker = MeshMaskerRay(
+                            velocity_set=self.velocity_set,
+                            precision_policy=self.precision_policy,
+                            compute_backend=ComputeBackend.WARP,
+                        )
+                    elif bc.voxelization_method is MeshVoxelizationMethod.WINDING:
+                        mesh_masker = MeshMaskerWinding(
+                            velocity_set=self.velocity_set,
+                            precision_policy=self.precision_policy,
+                            compute_backend=ComputeBackend.WARP,
+                        )
+                    elif bc.voxelization_method is MeshVoxelizationMethod.AABB_FILL:
+                        mesh_masker = MeshMaskerAABBFill(
+                            velocity_set=self.velocity_set,
+                            precision_policy=self.precision_policy,
+                            compute_backend=ComputeBackend.WARP,
+                        )
+                    else:
+                        raise ValueError(f"Unsupported voxelization method: {bc.voxelization_method}")
+                    f_1_warp, bc_mask_warp, missing_mask_warp = mesh_masker(bc_copy, f_1_warp, bc_mask_warp, missing_mask_warp)
 
             # call indices masker for all BC's with indices at this level
             bc_mask_warp, missing_mask_warp = self.indices_masker(bc_with_indices, bc_mask_warp, missing_mask_warp, start_index)
 
             @neon.Container.factory(name="MultiresBoundaryMasker")
             def container(
+                f_1_warp: typing.Any,
                 bc_mask_warp: typing.Any,
                 missing_mask_warp: typing.Any,
+                f_1_field: typing.Any,
                 bc_mask_field: typing.Any,
                 missing_mask_field: typing.Any,
             ):
                 def loading_step(loader: neon.Loader):
                     loader.set_mres_grid(bc_mask_field.get_grid(), level)
+                    f_1_hdl = loader.get_mres_write_handle(f_1_field)
                     bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
                     missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
 
@@ -102,15 +137,19 @@ def masker(gridIdx: typing.Any):
                             is_missing = wp.uint8(missing_mask_warp[q, lx, ly, lz])
                             wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
 
+                            if q != lattice_central_index and is_missing == wp.uint8(False):
+                                wp.neon_write(f_1_hdl, gridIdx, q, f_1_warp[q, lx, ly, lz])
+
                     loader.declare_kernel(masker)
 
                 return loading_step
 
-            c = container(bc_mask_warp, missing_mask_warp, bc_mask, missing_mask)
+            c = container(f_1_warp, bc_mask_warp, missing_mask_warp, f_1, bc_mask, missing_mask)
             c.run(0)
             wp.synchronize()
 
+            del f_1_warp
             del bc_mask_warp
             del missing_mask_warp
 
-        return bc_mask, missing_mask
+        return f_1, bc_mask, missing_mask
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 8035afda..ec69007c 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -82,9 +82,9 @@ def prepare_fields(self, rho, u, initializer=None):
         # f_0.export_vti("f0_eq_init.vti", "init_f0")
 
         # Process boundary conditions and update masks
-        bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, bc_mask, missing_mask, xlb_grid=self.grid)
+        f_1, bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, f_1, bc_mask, missing_mask, xlb_grid=self.grid)
         # Initialize auxiliary data if needed
-        f_0, f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_0, f_1, bc_mask, missing_mask)
+        f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_1, bc_mask, missing_mask)
         # bc_mask.update_host(0)
         bc_mask.update_host(0)
         f_0.update_host(0)
@@ -208,7 +208,7 @@ def compute(index: Any):
         return
 
     @classmethod
-    def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask, xlb_grid=None):
+    def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing_mask, xlb_grid=None):
         """Process boundary conditions and update boundary masks."""
         # Check for boundary condition overlaps
         # TODO! check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
@@ -220,19 +220,19 @@ def _process_boundary_conditions(cls, boundary_conditions, bc_mask, missing_mask
         )
 
         # Process all boundary conditions, either defined by indices or mesh_vertices
-        bc_mask, missing_mask = mres_masker(boundary_conditions, bc_mask, missing_mask, xlb_grid=xlb_grid)
+        f_1, bc_mask, missing_mask = mres_masker(boundary_conditions, f_1, bc_mask, missing_mask, xlb_grid=xlb_grid)
 
-        return bc_mask, missing_mask
+        return f_1, bc_mask, missing_mask
 
     @staticmethod
-    def _initialize_auxiliary_data(boundary_conditions, f_0, f_1, bc_mask, missing_mask):
+    def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
                 for level in range(bc_mask.get_grid().get_num_levels()):
                     # Initialize auxiliary data for each level
-                    f_0, f_1 = bc.multires_aux_data_init(f_0, f_1, bc_mask, missing_mask, level=level, stream=0)
-        return f_0, f_1
+                    f_1 = bc.multires_aux_data_init(f_1, bc_mask, missing_mask, level=level, stream=0)
+        return f_1
 
     def _construct_neon(self):
         # Set local constants

From 06773153b5771a7798b5b350b254a2f8b480b691 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 16 Jun 2025 14:33:55 -0400
Subject: [PATCH 072/208] addressed PR review comments.

---
 .../bc_extrapolation_outflow.py               |   8 +-
 .../bc_halfway_bounce_back.py                 |  24 +-
 xlb/operator/boundary_condition/bc_hybrid.py  |  72 +-
 xlb/operator/boundary_condition/bc_zouhe.py   |  22 +-
 .../boundary_condition/boundary_condition.py  |   6 +-
 .../boundary_condition/helper_functions_bc.py |   6 +-
 xlb/operator/boundary_masker/aabb.py          |  81 +-
 xlb/operator/boundary_masker/aabb_fill.py     |  79 +-
 .../boundary_masker/mesh_boundary_masker.py   |  26 +-
 xlb/operator/boundary_masker/ray.py           |  51 +-
 xlb/operator/boundary_masker/winding.py       |  72 +-
 xlb/operator/stepper/nse_stepper.py           |   4 +-
 xlb/utils/__init__.py                         |  23 +-
 xlb/utils/utils.py                            | 967 ++++++------------
 14 files changed, 467 insertions(+), 974 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 0f5b6128..c6e4eeb2 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -61,7 +61,7 @@ def __init__(
 
         # Unpack the two warp functionals needed for this BC!
         if self.compute_backend == ComputeBackend.WARP:
-            self.warp_functional, self.assemble_dynamic_data = self.warp_functional
+            self.warp_functional, self.assemble_auxiliary_data = self.warp_functional
 
     def _get_normal_vec(self, indices):
         # Get the frequency count and most common element directly
@@ -92,7 +92,7 @@ def _roll(self, fld, vec):
             return jnp.roll(fld, (vec[0], vec[1], vec[2]), axis=(1, 2, 3))
 
     @partial(jit, static_argnums=(0,), inline=True)
-    def assemble_dynamic_data(self, f_pre, f_post, bc_mask, missing_mask):
+    def assemble_auxiliary_data(self, f_pre, f_post, bc_mask, missing_mask):
         """
         Prepare time-dependent dynamic data for imposing the boundary condition in the next iteration after streaming.
         We use directions that leave the domain for storing this prepared data.
@@ -172,7 +172,7 @@ def functional(
             return _f
 
         @wp.func
-        def assemble_dynamic_data(
+        def assemble_auxiliary_data(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -200,7 +200,7 @@ def assemble_dynamic_data(
 
         kernel = self._construct_kernel(functional)
 
-        return (functional, assemble_dynamic_data), kernel
+        return (functional, assemble_auxiliary_data), kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, _f_pre, _f_post, bc_mask, missing_mask):
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 72859c0d..cf4f4259 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -74,21 +74,19 @@ def __init__(
             if profile is not None:
                 raise ValueError("Cannot specify both profile and prescribed_value")
 
-            # Convert input to numpy array for validation
-            if isinstance(prescribed_value, (tuple, list)):
-                prescribed_value = np.array(prescribed_value, dtype=np.float64)
-            elif isinstance(prescribed_value, np.ndarray):
-                prescribed_value = prescribed_value.astype(np.float64)
-            elif isinstance(prescribed_value, (int, float)):
-                raise ValueError("Velocity prescribed_value must be a tuple or array")
-
-            # Validate prescribed value
-            if not isinstance(prescribed_value, np.ndarray):
-                raise ValueError("Velocity prescribed_value must be an array-like")
+            # Ensure prescribed_value is a NumPy array of floats
+            if isinstance(prescribed_value, (tuple, list, np.ndarray)):
+                prescribed_value = np.asarray(prescribed_value, dtype=np.float64)
+            else:
+                raise ValueError("Velocity prescribed_value must be a tuple, list, or array")
+
+            # Handle 2D velocity sets
+            if self.velocity_set.d == 2:
+                assert len(prescribed_value) == 2, "For 2D velocity set, prescribed_value must be a tuple or array of length 2!"
+                prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
 
             # create a constant prescribed profile
-            # Note this BC class is only implemented in WARP.
-            prescribed_value = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)(prescribed_value)
+            prescribed_value = wp.vec(3, dtype=self.compute_dtype)(prescribed_value)
 
             @wp.func
             def prescribed_profile_warp(index: wp.vec3i, time: Any):
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 68593f6c..35e1cac1 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -8,8 +8,7 @@
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
-from xlb.operator.macroscopic import Macroscopic, ZeroMoment
-from xlb.operator.macroscopic import SecondMoment as MomentumFlux
+from xlb.operator.macroscopic import Macroscopic
 from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator.boundary_condition.boundary_condition import (
     ImplementationStep,
@@ -23,7 +22,7 @@ class HybridBC(BoundaryCondition):
     """
     The hybrid BC methods in this boundary condition have been originally developed by H. Salehipour and are inspired from
     various previous publications, in particular [1]. The reformulations are aimed to provide local formulations that are
-    computationally efficient and numerically stable at exessively large Reynolds numbers.
+    computationally efficient and numerically stable at high Reynolds numbers.
 
     [1] Dorschner, B., Chikatamarla, S. S., Bösch, F., & Karlin, I. V. (2015). Grad's approximation for moving and
         stationary walls in entropic lattice Boltzmann simulations. Journal of Computational Physics, 295, 340-354.
@@ -60,9 +59,11 @@ def __init__(
             voxelization_method,
         )
 
+        # Check if the compute backend is Warp
+        assert self.compute_backend == ComputeBackend.WARP, "This BC is currently only implemented with the Warp backend!"
+
         # Instantiate the operator for computing macroscopic values
         self.macroscopic = Macroscopic()
-        self.zero_moment = ZeroMoment()
         self.equilibrium = QuadraticEquilibrium()
 
         # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
@@ -78,28 +79,26 @@ def __init__(
         # Handle no-slip BCs if neither prescribed_value or profile are provided.
         if prescribed_value is None and profile is None:
             print(f"WARNING! Assuming no-slip condition for BC type = {self.__class__.__name__}_{self.bc_method}!")
-            prescribed_value = [0] * self.velocity_set.d
+            prescribed_value = [0, 0, 0]
 
         # Handle prescribed value if provided
         if prescribed_value is not None:
             if profile is not None:
                 raise ValueError("Cannot specify both profile and prescribed_value")
 
-            # Convert input to numpy array for validation
-            if isinstance(prescribed_value, (tuple, list)):
-                prescribed_value = np.array(prescribed_value, dtype=np.float64)
-            elif isinstance(prescribed_value, np.ndarray):
-                prescribed_value = prescribed_value.astype(np.float64)
-            elif isinstance(prescribed_value, (int, float)):
-                raise ValueError("Velocity prescribed_value must be a tuple or array")
+            # Ensure prescribed_value is a NumPy array of floats
+            if isinstance(prescribed_value, (tuple, list, np.ndarray)):
+                prescribed_value = np.asarray(prescribed_value, dtype=np.float64)
+            else:
+                raise ValueError("Velocity prescribed_value must be a tuple, list, or array")
 
-            # Validate prescribed value
-            if not isinstance(prescribed_value, np.ndarray):
-                raise ValueError("Velocity prescribed_value must be an array-like")
+            # Handle 2D velocity sets
+            if self.velocity_set.d == 2:
+                assert len(prescribed_value) == 2, "For 2D velocity set, prescribed_value must be a tuple or array of length 2!"
+                prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
 
             # create a constant prescribed profile
-            # Note this BC class is only implemented in WARP.
-            prescribed_value = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)(prescribed_value)
+            prescribed_value = wp.vec(3, dtype=self.compute_dtype)(prescribed_value)
 
             @wp.func
             def prescribed_profile_warp(index: wp.vec3i, time: Any):
@@ -120,43 +119,20 @@ def prescribed_profile_warp(index: wp.vec3i, time: Any):
         if self.mesh_vertices is None:
             assert self.indices is not None
             assert self.needs_mesh_distance is False, 'To use mesh distance, please provide the mesh vertices using keyword "mesh_vertices"!'
+            assert self.voxelization_method is None, "Voxelization method is only applicable when using mesh vertices!"
             self.needs_padding = True
-
-        # Raise error if used for 2d examples:
-        if self.velocity_set.d == 2:
-            raise NotImplementedError("This BC is not implemented in 2D!")
-
-        # if indices is not None:
-        #     # this BC would be limited to stationary boundaries
-        #     # assert mesh_vertices is None
-        # if mesh_vertices is not None:
-        #     # this BC would be applicable for stationary and moving boundaries
-        #     assert indices is None
-        #     if mesh_velocity_function is not None:
-        #         # mesh is moving and/or deforming
-
-        assert self.compute_backend == ComputeBackend.WARP, "This BC is currently only implemented with the Warp backend!"
+        else:
+            assert self.indices is None, "Cannot use indices with mesh vertices! Please provide mesh vertices only."
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
     def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # TODO
-        raise NotImplementedError(f"Operation {self.__class__.__name} not implemented in JAX!")
-        return
+        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in JAX!")
 
     def _construct_warp(self):
         # load helper functions
         bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
 
-        # Set local variables and constants
-        _c = self.velocity_set.c
-        _q = self.velocity_set.q
-        _d = self.velocity_set.d
-        _opp_indices = self.velocity_set.opp_indices
-        _f_vec = wp.vec(_q, dtype=self.compute_dtype)
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        _u_wall = _u_vec(0.0, 0.0, 0.0) if _d == 3 else _u_vec(0.0, 0.0)
-
         # Construct the functionals for this BC
         @wp.func
         def hybrid_bounceback_regularized(
@@ -172,7 +148,7 @@ def hybrid_bounceback_regularized(
             # missing data in lattice Boltzmann.
             # [1] Latt, J., Chopard, B., Malaspinas, O., Deville, M., Michler, A., 2008. Straight velocity
             #     boundaries in the lattice Boltzmann method. Physical Review E 77, 056703.
-            # [2] Yu, D., Mei, R., Shyy, W., 2003. A uniﬁed boundary treatment in lattice boltzmann method,
+            # [2] Yu, D., Mei, R., Shyy, W., 2003. A unified boundary treatment in lattice boltzmann method,
             #     in: 41st aerospace sciences meeting and exhibit, p. 953.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
@@ -211,7 +187,7 @@ def hybrid_bounceback_grads(
             # missing data in lattice Boltzmann.
             # [1] Dorschner, B., Chikatamarla, S. S., Bösch, F., & Karlin, I. V. (2015). Grad's approximation for moving and
             #    stationary walls in entropic lattice Boltzmann simulations. Journal of Computational Physics, 295, 340-354.
-            # [2] Yu, D., Mei, R., Shyy, W., 2003. A uniﬁed boundary treatment in lattice boltzmann method,
+            # [2] Yu, D., Mei, R., Shyy, W., 2003. A unified boundary treatment in lattice boltzmann method,
             #     in: 41st aerospace sciences meeting and exhibit, p. 953.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
@@ -231,7 +207,7 @@ def hybrid_bounceback_grads(
             # Compute density, velocity using all f_post-streaming values
             rho, u = self.macroscopic.warp_functional(f_post)
 
-            # Compute Grad's appriximation using full equation as in Eq (10) of Dorschner et al.
+            # Compute Grad's approximation using full equation as in Eq (10) of Dorschner et al.
             f_post = bc_helper.grads_approximate_fpop(_missing_mask, rho, u, f_post)
             return f_post
 
@@ -246,7 +222,7 @@ def hybrid_nonequilibrium_regularized(
             f_post: Any,
         ):
             # This boundary condition uses the method of Tao et al (2018) [1] to get unknown populations on curved boundaries (denoted here by
-            # interpolated_nonequilibrium_bounceback method). To further stabalize this BC, we add regularization technique of [2].
+            # interpolated_nonequilibrium_bounceback method). To further stabilize this BC, we add regularization technique of [2].
             # [1] Tao, Shi, et al. "One-point second-order curved boundary condition for lattice Boltzmann simulation of suspended particles."
             #     Computers & Mathematics with Applications 76.7 (2018): 1593-1607.
             # [2] Latt, J., Chopard, B., Malaspinas, O., Deville, M., Michler, A., 2008. Straight velocity
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 992c24a4..131c779b 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -73,21 +73,17 @@ def __init__(
             if profile is not None:
                 raise ValueError("Cannot specify both profile and prescribed_value")
 
-            # Convert input to numpy array for validation
-            if isinstance(prescribed_value, (tuple, list)):
-                prescribed_value = np.array(prescribed_value, dtype=np.float64)
-            elif isinstance(prescribed_value, (int, float)):
-                if bc_type == "pressure":
+            # Ensure prescribed_value is a NumPy array of floats
+            if bc_type == "velocity":
+                if isinstance(prescribed_value, (tuple, list, np.ndarray)):
+                    prescribed_value = np.asarray(prescribed_value, dtype=np.float64)
+                else:
+                    raise ValueError("Velocity prescribed_value must be a tuple, list, or array-like")
+            elif bc_type == "pressure":
+                if isinstance(prescribed_value, (int, float)):
                     prescribed_value = float(prescribed_value)
                 else:
-                    raise ValueError("Velocity prescribed_value must be a tuple or array")
-            elif isinstance(prescribed_value, np.ndarray):
-                prescribed_value = prescribed_value.astype(np.float64)
-
-            # Validate prescribed value
-            if bc_type == "velocity":
-                if not isinstance(prescribed_value, np.ndarray):
-                    raise ValueError("Velocity prescribed_value must be an array-like")
+                    raise ValueError("Pressure prescribed_value must be a scalar (int or float)")
 
                 # Check for non-zero elements - only one element should be non-zero
                 non_zero_count = np.count_nonzero(prescribed_value)
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index d86e989b..b6082ae2 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -84,7 +84,7 @@ def __init__(
             _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)  # TODO fix vec bool
 
         @wp.func
-        def assemble_dynamic_data(
+        def assemble_auxiliary_data(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -97,10 +97,10 @@ def assemble_dynamic_data(
 
         # Construct some helper warp functions for getting tid data
         if self.compute_backend == ComputeBackend.WARP:
-            self.assemble_dynamic_data = assemble_dynamic_data
+            self.assemble_auxiliary_data = assemble_auxiliary_data
 
     @partial(jit, static_argnums=(0,), inline=True)
-    def assemble_dynamic_data(self, f_pre, f_post, bc_mask, missing_mask):
+    def assemble_auxiliary_data(self, f_pre, f_post, bc_mask, missing_mask):
         """
         A placeholder function for prepare the auxiliary distribution functions for the boundary condition.
         currently being called after collision only.
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 0782ac96..67fa0525 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -1,9 +1,9 @@
+import warp as wp
+from typing import Any
 from xlb import DefaultConfig, ComputeBackend
 from xlb.operator.macroscopic import SecondMoment as MomentumFlux
 from xlb.operator.macroscopic import Macroscopic
 from xlb.operator.equilibrium import QuadraticEquilibrium
-import warp as wp
-from typing import Any
 
 
 class HelperFunctionsBC(object):
@@ -211,7 +211,7 @@ def interpolated_bounceback(
             # A local single-node version of the interpolated bounce-back boundary condition due to Bouzidi for a lattice
             # Boltzmann method simulation.
             # Ref:
-            # [1] Yu, D., Mei, R., Shyy, W., 2003. A uniﬁed boundary treatment in lattice boltzmann method,
+            # [1] Yu, D., Mei, R., Shyy, W., 2003. A unified boundary treatment in lattice boltzmann method,
             # in: 41st aerospace sciences meeting and exhibit, p. 953.
 
             one = compute_dtype(1.0)
diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index 62297288..b5cc6949 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -9,7 +9,11 @@
 
 class MeshMaskerAABB(MeshBoundaryMasker):
     """
-    Operator for creating a boundary missing_mask from an STL file
+    Operator for creating boundary missing_mask from mesh using Axis-Aligned Bounding Box (AABB) voxelization.
+
+    This implementation uses warp.mesh_query_aabb for efficient mesh-voxel intersection testing,
+    providing approximate 1-voxel thick surface detection around the mesh geometry.
+    Suitable for scenarios where fast, approximate boundary detection is sufficient.
     """
 
     def __init__(
@@ -27,47 +31,14 @@ def _construct_warp(self):
         _q = self.velocity_set.q
         _opp_indices = self.velocity_set.opp_indices
 
-        # Do voxelization mesh query (warp.mesh_query_aabb) to find solid voxels
-        #  - this gives an approximate 1 voxel thick surface around mesh
         @wp.kernel
         def kernel(
-            mesh_id: wp.uint64,
-            id_number: wp.int32,
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # get index
-            i, j, k = wp.tid()
-
-            # Get local indices
-            index = wp.vec3i(i, j, k)
-
-            # position of the point
-            pos_bc_cell = self.index_to_position(index)
-            half = wp.vec3(0.5, 0.5, 0.5)
-
-            if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell - half):
-                # Make solid voxel
-                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
-            else:
-                # Find the boundary voxels and their missing directions
-                for l in range(1, _q):
-                    _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
-
-                    # Check to see if this neighbor is solid - this is super inefficient TODO: make it way better
-                    if self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell + _dir - half):
-                        # We know we have a solid neighbor
-                        # Set the boundary id and missing_mask
-                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
-
-        @wp.kernel
-        def kernel_with_distance(
             mesh_id: wp.uint64,
             id_number: wp.int32,
             distances: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.bool),
+            needs_mesh_distance: bool,
         ):
             # get index
             i, j, k = wp.tid()
@@ -76,43 +47,47 @@ def kernel_with_distance(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            pos_bc_cell = self.index_to_position(index)
-            half = wp.vec3(0.5, 0.5, 0.5)
+            cell_center_pos = self.index_to_position(index)
+            HALF_VOXEL = wp.vec3(0.5, 0.5, 0.5)
 
-            if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell - half):
+            if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255) or self.mesh_voxel_intersect(
+                mesh_id=mesh_id, low=cell_center_pos - HALF_VOXEL
+            ):
                 # Make solid voxel
                 bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
             else:
                 # Find the boundary voxels and their missing directions
-                for l in range(1, _q):
-                    _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+                for direction_idx in range(1, _q):
+                    direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
 
-                    # Check to see if this neighbor is solid - this is super inefficient TODO: make it way better
-                    if self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell + _dir - half):
+                    # Check to see if this neighbor is solid
+                    if self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos + direction_vec - HALF_VOXEL):
                         # We know we have a solid neighbor
                         # Set the boundary id and missing_mask
                         bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+                        missing_mask[_opp_indices[direction_idx], index[0], index[1], index[2]] = True
+
+                        # If we don't need the mesh distance, skip the distance computation and move on to the next direction
+                        if not needs_mesh_distance:
+                            continue
 
                         # Find the fractional distance to the mesh in each direction
                         # We increase max_length to find intersections in neighboring cells
-                        max_length = wp.length(_dir)
-                        query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, 1.5 * max_length)
+                        max_length = wp.length(direction_vec)
+                        query = wp.mesh_query_ray(mesh_id, cell_center_pos, direction_vec / max_length, 1.5 * max_length)
                         if query.result:
                             # get position of the mesh triangle that intersects with the ray
                             pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
                             # We reduce the distance to give some wall thickness
-                            dist = wp.length(pos_mesh - pos_bc_cell) - 0.5 * max_length
+                            dist = wp.length(pos_mesh - cell_center_pos) - 0.5 * max_length
                             weight = self.store_dtype(dist / max_length)
-                            distances[l, index[0], index[1], index[2]] = weight
-                            # if weight <= 0.0 or weight > 1.0:
-                            #     wp.printf("Got bad weight %f at %d,%d,%d\n", weight, index[0], index[1], index[2])
+                            distances[direction_idx, index[0], index[1], index[2]] = weight
                         else:
-                            # We didn't have an intersection in the given direction but we know we should so we assume the solid is slightly thicker
-                            # and one lattice direction away from the BC voxel
-                            distances[l, index[0], index[1], index[2]] = self.store_dtype(1.0)
+                            # Expected an intersection in this direction but none was found.
+                            # Assume the solid extends one lattice unit beyond the BC voxel leading to a distance fraction of 1.
+                            distances[direction_idx, index[0], index[1], index[2]] = self.store_dtype(1.0)
 
-        return None, [kernel, kernel_with_distance]
+        return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(
diff --git a/xlb/operator/boundary_masker/aabb_fill.py b/xlb/operator/boundary_masker/aabb_fill.py
index 1460f012..d4b47b5b 100644
--- a/xlb/operator/boundary_masker/aabb_fill.py
+++ b/xlb/operator/boundary_masker/aabb_fill.py
@@ -87,50 +87,23 @@ def kernel_solid(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            pos_bc_cell = self.index_to_position(index) + offset
+            cell_center_pos = self.index_to_position(index) + offset
             half = wp.vec3(0.5, 0.5, 0.5)
 
-            if self.mesh_voxel_intersect(mesh_id=mesh_id, low=pos_bc_cell - half):
+            if self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - half):
                 # Make solid voxel
                 solid_mask[index[0], index[1], index[2]] = wp.int32(255)
 
-        # Assign the bc_mask based on the solid_mask we already computed
-        @wp.kernel
-        def kernel(
-            mesh_id: wp.uint64,
-            id_number: wp.int32,
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-            solid_mask: wp.array3d(dtype=wp.uint8),
-        ):
-            # get index
-            i, j, k = wp.tid()
-
-            # Get local indices
-            index = wp.vec3i(i, j, k)
-
-            if solid_mask[i, j, k] == wp.uint8(255):
-                # Make solid voxel
-                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
-            else:
-                # Find the boundary voxels and their missing directions
-                for l in range(1, _q):
-                    # Check to see if this neighbor is solid - this is super inefficient TODO: make it way better
-                    if solid_mask[i + _c[0, l], j + _c[1, l], k + _c[2, l]] == wp.uint8(255):
-                        # We know we have a solid neighbor
-                        # Set the boundary id and missing_mask
-                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
-
         # Assign the bc_mask and distances based on the solid_mask we already computed
         @wp.kernel
-        def kernel_with_distance(
+        def kernel(
             mesh_id: wp.uint64,
             id_number: wp.int32,
             distances: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.bool),
             solid_mask: wp.array3d(dtype=wp.uint8),
+            needs_mesh_distance: bool,
         ):
             # get index
             i, j, k = wp.tid()
@@ -139,45 +112,46 @@ def kernel_with_distance(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            pos_bc_cell = self.index_to_position(index)
+            cell_center_pos = self.index_to_position(index)
 
             if solid_mask[i, j, k] == wp.uint8(255) or bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255):
                 # Make solid voxel
                 bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
             else:
                 # Find the boundary voxels and their missing directions
-                for l in range(1, _q):
-                    _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+                for direction_idx in range(1, _q):
+                    direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
 
                     # Check to see if this neighbor is solid - this is super inefficient TODO: make it way better
                     # if solid_mask[i,j,k] == wp.uint8(255):
-                    if solid_mask[i + _c[0, l], j + _c[1, l], k + _c[2, l]] == wp.uint8(255):
+                    if solid_mask[i + _c[0, direction_idx], j + _c[1, direction_idx], k + _c[2, direction_idx]] == wp.uint8(255):
                         # We know we have a solid neighbor
                         # Set the boundary id and missing_mask
                         bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                        missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+                        missing_mask[_opp_indices[direction_idx], index[0], index[1], index[2]] = True
+
+                        # If we don't need the mesh distance, skip the distance computation and move on to the next direction
+                        if not needs_mesh_distance:
+                            continue
 
                         # Find the fractional distance to the mesh in each direction
                         # We increase max_length to find intersections in neighboring cells
-                        max_length = wp.length(_dir)
-                        query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, 1.5 * max_length)
+                        max_length = wp.length(direction_vec)
+                        query = wp.mesh_query_ray(mesh_id, cell_center_pos, direction_vec / max_length, 1.5 * max_length)
                         if query.result:
                             # get position of the mesh triangle that intersects with the ray
                             pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
                             # We reduce the distance to give some wall thickness
-                            dist = wp.length(pos_mesh - pos_bc_cell) - 0.5 * max_length
+                            dist = wp.length(pos_mesh - cell_center_pos) - 0.5 * max_length
                             weight = self.store_dtype(dist / max_length)
-                            distances[l, index[0], index[1], index[2]] = weight
-                            # if weight <= 0.0 or weight > 1.0:
-                            #     wp.printf("Got bad weight %f at %d,%d,%d\n", weight, index[0], index[1], index[2])
+                            distances[direction_idx, index[0], index[1], index[2]] = weight
                         else:
                             # We didn't have an intersection in the given direction but we know we should so we assume the solid is slightly thicker
                             # and one lattice direction away from the BC voxel
-                            distances[l, index[0], index[1], index[2]] = self.store_dtype(1.0)
+                            distances[direction_idx, index[0], index[1], index[2]] = self.store_dtype(1.0)
 
         kernel_dict = {
             "kernel": kernel,
-            "kernel_with_distance": kernel_with_distance,
             "kernel_solid": kernel_solid,
             "erode_tile": erode_tile,
             "dilate_tile": dilate_tile,
@@ -259,18 +233,11 @@ def warp_implementation(
         )
 
         # Launch the main kernel for boundary masker
-        if bc.needs_mesh_distance:
-            wp.launch(
-                kernel_dict["kernel_with_distance"],
-                inputs=[mesh_id, bc_id, distances, bc_mask, missing_mask, solid_mask_cropped],
-                dim=bc_mask.shape[1:],
-            )
-        else:
-            wp.launch(
-                kernel_dict["kernel"],
-                inputs=[mesh_id, bc_id, bc_mask, missing_mask, solid_mask_cropped],
-                dim=bc_mask.shape[1:],
-            )
+        wp.launch(
+            kernel_dict["kernel"],
+            inputs=[mesh_id, bc_id, distances, bc_mask, missing_mask, solid_mask_cropped, wp.static(bc.needs_mesh_distance)],
+            dim=bc_mask.shape[1:],
+        )
 
         # Resolve out of bound indices
         wp.launch(
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index 1bcc2000..fa7a6bc6 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -187,13 +187,7 @@ def jax_implementation(
         bc_mask,
         missing_mask,
     ):
-        raise NotImplementedError(f"Operation {self.__class__.__name} not implemented in JAX!")
-        # Use Warp backend even for this particular operation.
-        wp.init()
-        bc_mask = wp.from_jax(bc_mask)
-        missing_mask = wp.from_jax(missing_mask)
-        bc_mask, missing_mask = self.warp_implementation(bc, bc_mask, missing_mask)
-        return wp.to_jax(bc_mask), wp.to_jax(missing_mask)
+        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in JAX!")
 
     def warp_implementation_base(
         self,
@@ -230,19 +224,11 @@ def warp_implementation_base(
         bc_id = bc.id
 
         # Launch the appropriate warp kernel
-        kernel_list = self.warp_kernel
-        if bc.needs_mesh_distance:
-            wp.launch(
-                kernel_list[1],
-                inputs=[mesh_id, bc_id, distances, bc_mask, missing_mask],
-                dim=bc_mask.shape[1:],
-            )
-        else:
-            wp.launch(
-                kernel_list[0],
-                inputs=[mesh_id, bc_id, bc_mask, missing_mask],
-                dim=bc_mask.shape[1:],
-            )
+        wp.launch(
+            self.warp_kernel,
+            inputs=[mesh_id, bc_id, distances, bc_mask, missing_mask, wp.static(bc.needs_mesh_distance)],
+            dim=bc_mask.shape[1:],
+        )
         wp.launch(
             self.resolve_out_of_bound_kernel,
             inputs=[bc_id, bc_mask, missing_mask],
diff --git a/xlb/operator/boundary_masker/ray.py b/xlb/operator/boundary_masker/ray.py
index d5ec27c2..604adad8 100644
--- a/xlb/operator/boundary_masker/ray.py
+++ b/xlb/operator/boundary_masker/ray.py
@@ -29,37 +29,12 @@ def _construct_warp(self):
 
         @wp.kernel
         def kernel(
-            mesh_id: wp.uint64,
-            id_number: wp.int32,
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # get index
-            i, j, k = wp.tid()
-
-            # Get local indices
-            index = wp.vec3i(i, j, k)
-
-            # position of the point
-            pos_bc_cell = self.index_to_position(index)
-
-            for l in range(1, _q):
-                _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
-                # Max length depends on ray direction (diagonals are longer)
-                max_length = wp.length(_dir)
-                query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, max_length)
-                if query.result:
-                    # Set the boundary id and missing_mask
-                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                    missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
-
-        @wp.kernel
-        def kernel_with_distance(
             mesh_id: wp.uint64,
             id_number: wp.int32,
             distances: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.bool),
+            needs_mesh_distance: bool,
         ):
             # get index
             i, j, k = wp.tid()
@@ -68,28 +43,30 @@ def kernel_with_distance(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            pos_bc_cell = self.index_to_position(index)
+            cell_center_pos = self.index_to_position(index)
 
             # Find the fractional distance to the mesh in each direction
-            for l in range(1, _q):
-                _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+            for direction_idx in range(1, _q):
+                direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
                 # Max length depends on ray direction (diagonals are longer)
-                max_length = wp.length(_dir)
-                query = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, max_length)
+                max_length = wp.length(direction_vec)
+                query = wp.mesh_query_ray(mesh_id, cell_center_pos, direction_vec / max_length, max_length)
                 if query.result:
                     # Set the boundary id and missing_mask
                     bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                    missing_mask[_opp_indices[l], index[0], index[1], index[2]] = True
+                    missing_mask[_opp_indices[direction_idx], index[0], index[1], index[2]] = True
+
+                    # If we don't need the mesh distance, skip to the next direction
+                    if not needs_mesh_distance:
+                        continue
 
                     # get position of the mesh triangle that intersects with the ray
                     pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
-                    dist = wp.length(pos_mesh - pos_bc_cell)
+                    dist = wp.length(pos_mesh - cell_center_pos)
                     weight = self.store_dtype(dist / max_length)
-                    distances[l, index[0], index[1], index[2]] = weight
-                    # if weight < 0.0 or weight > 1.0:
-                    #     wp.printf("Got bad weight %f at %d,%d,%d\n", weight, index[0], index[1], index[2])
+                    distances[direction_idx, index[0], index[1], index[2]] = weight
 
-        return None, [kernel, kernel_with_distance]
+        return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(
diff --git a/xlb/operator/boundary_masker/winding.py b/xlb/operator/boundary_masker/winding.py
index fc813012..6e032f40 100644
--- a/xlb/operator/boundary_masker/winding.py
+++ b/xlb/operator/boundary_masker/winding.py
@@ -29,56 +29,12 @@ def _construct_warp(self):
 
         @wp.kernel
         def kernel(
-            mesh_id: wp.uint64,
-            id_number: wp.int32,
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # get index
-            i, j, k = wp.tid()
-
-            # Get local indices
-            index = wp.vec3i(i, j, k)
-
-            # position of the point
-            pos_bc_cell = self.index_to_position(index)
-
-            # Compute the maximum length
-            max_length = wp.sqrt(
-                (wp.float32(bc_mask.shape[1])) ** 2.0 + (wp.float32(bc_mask.shape[2])) ** 2.0 + (wp.float32(bc_mask.shape[3])) ** 2.0
-            )
-
-            # evaluate if point is inside mesh
-            query = wp.mesh_query_point_sign_winding_number(mesh_id, pos_bc_cell, max_length)
-            if query.result:
-                # set point to be solid
-                if query.sign <= 0:  # TODO: fix this
-                    # Make solid voxel
-                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
-
-                    # Find the fractional distance to the mesh in each direction
-                    for l in range(1, _q):
-                        _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
-                        # Max length depends on ray direction (diagonals are longer)
-                        max_length = wp.length(_dir)
-                        query_dir = wp.mesh_query_ray(mesh_id, pos_bc_cell, _dir / max_length, max_length)
-                        if query_dir.result:
-                            # Get the index of the streaming direction
-                            push_index = wp.vec3i()
-                            for d in range(self.velocity_set.d):
-                                push_index[d] = index[d] + _c[d, l]
-
-                            # Set the boundary id and missing_mask
-                            bc_mask[0, push_index[0], push_index[1], push_index[2]] = wp.uint8(id_number)
-                            missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
-
-        @wp.kernel
-        def kernel_with_distance(
             mesh_id: wp.uint64,
             id_number: wp.int32,
             distances: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.bool),
+            needs_mesh_distance: bool,
         ):
             # get index
             i, j, k = wp.tid()
@@ -103,31 +59,33 @@ def kernel_with_distance(
                     bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
 
                     # Find the fractional distance to the mesh in each direction
-                    for l in range(1, _q):
-                        _dir = wp.vec3f(wp.float32(_c[0, l]), wp.float32(_c[1, l]), wp.float32(_c[2, l]))
+                    for direction_idx in range(1, _q):
+                        direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
                         # Max length depends on ray direction (diagonals are longer)
-                        max_length = wp.length(_dir)
-                        query_dir = wp.mesh_query_ray(mesh_id, pos_cell, _dir / max_length, max_length)
+                        max_length = wp.length(direction_vec)
+                        query_dir = wp.mesh_query_ray(mesh_id, pos_cell, direction_vec / max_length, max_length)
                         if query_dir.result:
                             # Get the index of the streaming direction
                             push_index = wp.vec3i()
                             for d in range(self.velocity_set.d):
-                                push_index[d] = index[d] + _c[d, l]
+                                push_index[d] = index[d] + _c[d, direction_idx]
 
                             # Set the boundary id and missing_mask
                             bc_mask[0, push_index[0], push_index[1], push_index[2]] = wp.uint8(id_number)
-                            missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
+                            missing_mask[direction_idx, push_index[0], push_index[1], push_index[2]] = True
+
+                            # If we don't need the mesh distance, skip to the next direction
+                            if not needs_mesh_distance:
+                                continue
 
                             # get position of the mesh triangle that intersects with the ray
                             pos_mesh = wp.mesh_eval_position(mesh_id, query_dir.face, query_dir.u, query_dir.v)
-                            pos_bc_cell = self.index_to_position(push_index)
-                            dist = wp.length(pos_mesh - pos_bc_cell)
+                            cell_center_pos = self.index_to_position(push_index)
+                            dist = wp.length(pos_mesh - cell_center_pos)
                             weight = self.store_dtype(dist / max_length)
-                            distances[_opp_indices[l], push_index[0], push_index[1], push_index[2]] = weight
-                            # if weight < 0.0 or weight > 1.0:
-                            #     wp.printf("Got bad weight %f at %d,%d,%d\n", weight, push_index[0], push_index[1], push_index[2])
+                            distances[_opp_indices[direction_idx], push_index[0], push_index[1], push_index[2]] = weight
 
-        return None, [kernel, kernel_with_distance]
+        return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 3a17d780..9f02d7a6 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -197,7 +197,7 @@ def jax_implementation(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
 
         # Apply collision type boundary conditions
         for bc in self.boundary_conditions:
-            f_post_collision = bc.assemble_dynamic_data(f_post_stream, f_post_collision, bc_mask, missing_mask)
+            f_post_collision = bc.assemble_auxiliary_data(f_post_stream, f_post_collision, bc_mask, missing_mask)
             if bc.implementation_step == ImplementationStep.COLLISION:
                 f_post_collision = bc(
                     f_post_stream,
@@ -255,7 +255,7 @@ def apply_bc(
                             f_result = wp.static(self.boundary_conditions[i].warp_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
                     if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].assemble_dynamic_data)(
+                            f_result = wp.static(self.boundary_conditions[i].assemble_auxiliary_data)(
                                 index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
                             )
             return f_result
diff --git a/xlb/utils/__init__.py b/xlb/utils/__init__.py
index 213e936d..3c8032e2 100644
--- a/xlb/utils/__init__.py
+++ b/xlb/utils/__init__.py
@@ -1,14 +1,9 @@
-from .utils import (
-    downsample_field,
-    save_image,
-    save_fields_vtk,
-    save_BCs_vtk,
-    rotate_geometry,
-    voxelize_stl,
-    axangle2mat,
-    save_fields_hdf5,
-    voxelize_stl_open3d,
-    q_criterion,
-    map_field_vtk,
-    map_field_vtk_interpolator,
-)
+from .utils import (
+    downsample_field,
+    save_image,
+    save_fields_vtk,
+    save_BCs_vtk,
+    rotate_geometry,
+    voxelize_stl,
+    axangle2mat,
+)
diff --git a/xlb/utils/utils.py b/xlb/utils/utils.py
index dda4534f..0a9858a5 100644
--- a/xlb/utils/utils.py
+++ b/xlb/utils/utils.py
@@ -1,651 +1,316 @@
-import numpy as np
-import matplotlib.pylab as plt
-from matplotlib import cm
-from time import time
-import pyvista as pv
-from scipy.interpolate import RegularGridInterpolator
-from scipy.ndimage import map_coordinates
-from jax.image import resize
-from jax import jit
-import jax.numpy as jnp
-from functools import partial
-import trimesh
-import vtk
-import open3d as o3d
-import h5py
-
-import os
-import __main__
-
-
-@partial(jit, static_argnums=(1, 2))
-def downsample_field(field, factor, method="bicubic"):
-    """
-    Downsample a JAX array by a factor of `factor` along each axis.
-
-    Parameters
-    ----------
-    field : jax.numpy.ndarray
-        The input vector field to be downsampled. This should be a 3D or 4D JAX array where the last dimension is 2 or 3 (vector components).
-    factor : int
-        The factor by which to downsample the field. The dimensions of the field will be divided by this factor.
-    method : str, optional
-        The method to use for downsampling. Default is 'bicubic'.
-
-    Returns
-    -------
-    jax.numpy.ndarray
-        The downsampled field.
-    """
-    if factor == 1:
-        return field
-    else:
-        new_shape = tuple(dim // factor for dim in field.shape[:-1])
-        downsampled_components = []
-        for i in range(field.shape[-1]):  # Iterate over the last dimension (vector components)
-            resized = resize(field[..., i], new_shape, method=method)
-            downsampled_components.append(resized)
-
-        return jnp.stack(downsampled_components, axis=-1)
-
-
-def save_image(fld, timestep=None, prefix=None, **kwargs):
-    """
-    Save an image of a field at a given timestep.
-
-    Parameters
-    ----------
-    timestep : int
-        The timestep at which the field is being saved.
-    fld : jax.numpy.ndarray
-        The field to be saved. This should be a 2D or 3D JAX array. If the field is 3D, the magnitude of the field will be calculated and saved.
-    prefix : str, optional
-        A prefix to be added to the filename. The filename will be the name of the main script file by default.
-
-    Returns
-    -------
-    None
-
-    Notes
-    -----
-    This function saves the field as an image in the PNG format.
-    The filename is based on the name of the main script file, the provided prefix, and the timestep number.
-    If the field is 3D, the magnitude of the field is calculated and saved.
-    The image is saved with the 'nipy_spectral' colormap and the origin set to 'lower'.
-    """
-    if prefix is None:
-        fname = os.path.basename(__main__.__file__)
-        fname = os.path.splitext(fname)[0]
-    else:
-        fname = prefix
-
-    if timestep is not None:
-        fname = fname + "_" + str(timestep).zfill(4)
-
-    if len(fld.shape) > 3:
-        raise ValueError("The input field should be 2D!")
-    if len(fld.shape) == 3:
-        fld = np.sqrt(fld[0, ...] ** 2 + fld[0, ...] ** 2)
-
-    plt.clf()
-    kwargs.pop("cmap", None)
-    plt.imsave(fname + ".png", fld.T, cmap=cm.nipy_spectral, origin="lower", **kwargs)
-
-
-def save_fields_vtk(fields, timestep, output_dir=".", prefix="fields", shift_coords=(0, 0, 0), scale=1):
-    """
-    Save VTK fields to the specified directory, shifting the coordinates if needed.
-
-    Parameters
-    ----------
-    timestep (int): The timestep number to be associated with the saved fields.
-    fields (Dict[str, np.ndarray]): A dictionary of fields to be saved. Each field must be an array-like object
-        with dimensions (nx, ny) for 2D fields or (nx, ny, nz) for 3D fields, where:
-            - nx : int, number of grid points along the x-axis
-            - ny : int, number of grid points along the y-axis
-            - nz : int, number of grid points along the z-axis (for 3D fields only)
-        The key value for each field in the dictionary must be a string containing the name of the field.
-    output_dir (str, optional, default: '.'): The directory in which to save the VTK files. Defaults to the current directory.
-    prefix (str, optional, default: 'fields'): A prefix to be added to the filename. Defaults to 'fields'.
-    shift_coords (tuple, optional, default: (0, 0, 0)): The amount to shift in the x, y, and z directions.
-    scale (int, optional, default: 1): The amount to scale the geometry.
-
-    Returns
-    -------
-    None
-
-    Notes
-    -----
-    This function saves the VTK fields in the specified directory, with filenames based on the provided timestep number
-    and the filename. For example, if the timestep number is 10 and the file name is fields, the VTK file
-    will be saved as 'fields_0000010.vtk'in the specified directory.
-
-    """
-    start = time()
-    # Assert that all fields have the same dimensions
-    for key, value in fields.items():
-        if key == list(fields.keys())[0]:
-            dimensions = value.shape
-        else:
-            assert value.shape == dimensions, "All fields must have the same dimensions!"
-
-    output_filename = os.path.join(output_dir, prefix + "_" + f"{timestep:08d}.vtk")
-
-    # Add 1 to the dimensions tuple as we store cell values
-    dimensions = tuple([dim + 1 for dim in dimensions])
-
-    # Create a uniform grid
-    if value.ndim == 2:
-        dimensions = dimensions + (1,)
-
-    grid = pv.ImageData(dimensions=dimensions, origin=shift_coords, spacing=(scale, scale, scale))
-
-    # Add the fields to the grid
-    for key, value in fields.items():
-        grid[key] = value.flatten(order="F")
-
-    # Save the grid to a VTK file
-    grid.save(output_filename, binary=True)
-    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
-
-
-def map_field_vtk_interpolator(
-    field, stl_filename, voxel_size, output_dir=".", prefix="mapped_field", origin=[0, 0, 0], method="cubic", normals=True
-):
-    """
-    Map a volumetric field onto an STL mesh using RegularGridInterpolator.
-
-    Parameters
-    ----------
-    field : np.ndarray
-        3D array representing the volumetric field.
-    stl_filename : str
-        Path to the STL file.
-    voxel_size : float
-        Size of a voxel along each axis.
-    output_dir : str, optional
-        Directory to save the output VTK file.
-    prefix : str, optional
-        Filename prefix.
-    origin : list or tuple of float, optional
-        Origin of the grid.
-    method : str, optional
-        Interpolation method (e.g., 'cubic').
-    normals : bool, optional
-        If True, use normal-direction averaging by sampling points offset along the surface normal;
-        if False, simply sample the field at the surface points.
-
-    Returns
-    -------
-    None
-    """
-
-    print("Mapping field to stl with {} method".format("normal averaging" if normals else "original sampling"))
-    start = time()
-    grid_shape = field.shape
-
-    # Create coordinate arrays based on the origin and voxel size.
-    x = origin[0] + np.arange(grid_shape[0]) * voxel_size
-    y = origin[1] + np.arange(grid_shape[1]) * voxel_size
-    z = origin[2] + np.arange(grid_shape[2]) * voxel_size
-
-    # Set up the interpolation function.
-    interp_func = RegularGridInterpolator((x, y, z), field, method=method, bounds_error=False, fill_value=None)
-
-    # Load the STL mesh.
-    stl_mesh = pv.read(stl_filename)
-
-    if normals:
-        # Compute normals if not already available.
-        if "Normals" not in stl_mesh.point_data:
-            stl_mesh = stl_mesh.compute_normals()
-        normals_arr = stl_mesh.point_normals  # shape (N, 3)
-        points = stl_mesh.points  # shape (N, 3)
-
-        # Define offsets along the normal: sample 2 voxels in both directions including the surface.
-        offsets = np.array([-2, 2]) * voxel_size  # shape (5,)
-        # offsets = np.array([-2, -1, 0, 1, 2]) * voxel_size  # shape (5,)
-        # Generate sample points along the normal for each mesh point.
-        sample_points = points[:, np.newaxis, :] + offsets[np.newaxis, :, np.newaxis] * normals_arr[:, np.newaxis, :]
-        sample_points_reshaped = sample_points.reshape(-1, 3)
-
-        # Interpolate the field at each of the sample points.
-        field_values = interp_func(sample_points_reshaped)
-        field_values = field_values.reshape(points.shape[0], len(offsets))
-        # Average the values along the normal offset direction.
-        field_mapped = np.mean(field_values, axis=1)
-    else:
-        # Original: simply sample the field at the surface points.
-        points = stl_mesh.points
-        field_mapped = interp_func(points)
-
-    # Assign the mapped field to the mesh and save.
-    stl_mesh["field"] = field_mapped
-    output_filename = os.path.join(output_dir, prefix + ".vtk")
-    stl_mesh.save(output_filename)
-    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
-
-
-def map_field_vtk(field, stl_filename, output_dir=".", prefix="mapped_field", shift_coords=(0, 0, 0), scale=1, normals=True):
-    """
-    Save VTK fields to the specified directory by probing a uniform grid
-    generated from a field array onto an STL mesh. If normals is True, for
-    each STL point the field is averaged over points offset along the surface normal.
-
-    Parameters
-    ----------
-    field : np.ndarray
-        The field data (2D or 3D) to be mapped.
-    stl_filename : str
-        Path to the STL file.
-    output_dir : str, optional
-        Directory to save the output VTK file.
-    prefix : str, optional
-        Filename prefix.
-    shift_coords : tuple, optional
-        Origin (shift) for the uniform grid.
-    scale : int or float, optional
-        Spacing of the uniform grid.
-    normals : bool, optional
-        If True, average field values along the surface normal (sampling 2 voxels on either side);
-        if False, use the original probe method.
-
-    Returns
-    -------
-    None
-    """
-    start = time()
-    method_str = "normal averaging" if normals else "original sampling"
-    print(f"Mapping field to stl with {method_str}")
-    output_filename = os.path.join(output_dir, prefix + ".vtk")
-
-    # Create the uniform grid dimensions (note: cell values require dimensions + 1).
-    dimensions = tuple(dim + 1 for dim in field.shape)
-    if field.ndim == 2:
-        dimensions = dimensions + (1,)
-
-    # Create a uniform grid (ImageData) with the specified origin and spacing.
-    grid = pv.ImageData(dimensions=dimensions, origin=shift_coords, spacing=(scale, scale, scale))
-    grid.cell_data["field"] = field.flatten(order="F")
-    grid = grid.cell_data_to_point_data()
-
-    # Load the STL mesh.
-    stl_mesh = pv.read(stl_filename)
-
-    if normals:
-        # Compute normals if not available.
-        if "Normals" not in stl_mesh.point_data:
-            stl_mesh = stl_mesh.compute_normals()
-        normals_arr = stl_mesh.point_normals  # shape (N, 3)
-        points = stl_mesh.points  # shape (N, 3)
-
-        # Define offsets along the normal: sample 2 voxels in both directions.
-        offsets = np.array([-2, 2]) * scale
-        # offsets = np.array([-2, -1, 0, 1, 2]) * scale
-        # Generate sample points along the normal for each STL point.
-        sample_points = points[:, np.newaxis, :] + offsets[np.newaxis, :, np.newaxis] * normals_arr[:, np.newaxis, :]
-        sample_points_reshaped = sample_points.reshape(-1, 3)
-
-        # Create a PolyData object from these sample points.
-        samples_pd = pv.PolyData(sample_points_reshaped)
-
-        # Use vtkProbeFilter to sample the grid at these sample locations.
-        probe = vtk.vtkProbeFilter()
-        probe.SetInputData(samples_pd)
-        probe.SetSourceData(grid)
-        probe.Update()
-        sampled = pv.wrap(probe.GetOutput())
-        sample_field = sampled.point_data["field"]
-        averaged_field = np.mean(sample_field.reshape(-1, len(offsets)), axis=1)
-
-        # Assign the averaged field to the mesh.
-        stl_mesh["field"] = averaged_field
-        stl_mesh.save(output_filename)
-    else:
-        # Original method: use vtkProbeFilter on the STL geometry.
-        stl_vtk = stl_mesh.extract_geometry()
-        probe = vtk.vtkProbeFilter()
-        probe.SetInputData(stl_vtk)
-        probe.SetSourceData(grid)
-        probe.Update()
-        stl_mapped = pv.wrap(probe.GetOutput())
-        stl_mapped.save(output_filename)
-
-    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
-
-
-def save_BCs_vtk(timestep, BCs, gridInfo, output_dir="."):
-    """
-    Save boundary conditions as VTK format to the specified directory.
-
-    Parameters
-    ----------
-    timestep (int): The timestep number to be associated with the saved fields.
-    BCs (List[BC]): A list of boundary conditions to be saved. Each boundary condition must be an object of type BC.
-
-    Returns
-    -------
-    None
-
-    Notes
-    -----
-    This function saves the boundary conditions in the specified directory, with filenames based on the provided timestep number
-    and the filename. For example, if the timestep number is 10, the VTK file
-    will be saved as 'BCs_0000010.vtk'in the specified directory.
-    """
-
-    # Create a uniform grid
-    if gridInfo["nz"] == 0:
-        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, 1)
-        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], 1)
-    else:
-        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, gridInfo["nz"] + 1)
-        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], gridInfo["nz"])
-
-    grid = pv.ImageData(dimensions=gridDimensions)
-
-    # Dictionary to keep track of encountered BC names
-    bcNamesCount = {}
-
-    for bc in BCs:
-        bcName = bc.name
-        if bcName in bcNamesCount:
-            bcNamesCount[bcName] += 1
-        else:
-            bcNamesCount[bcName] = 0
-        bcName += f"_{bcNamesCount[bcName]}"
-
-        if bc.isDynamic:
-            bcIndices, _ = bc.update_function(timestep)
-        else:
-            bcIndices = bc.indices
-
-        # Convert indices to 1D indices
-        if gridInfo["dim"] == 2:
-            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions[:-1], order="F")
-        else:
-            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions, order="F")
-
-        grid[bcName] = np.zeros(fieldDimensions, dtype=bool).flatten(order="F")
-        grid[bcName][bcIndices] = True
-
-    # Save the grid to a VTK file
-    output_filename = os.path.join(output_dir, "BCs_" + f"{timestep:07d}.vtk")
-
-    start = time()
-    grid.save(output_filename, binary=True)
-    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
-
-
-def rotate_geometry(indices, origin, axis, angle):
-    """
-    Rotates a voxelized mesh around a given axis.
-
-    Parameters
-    ----------
-    indices : array-like
-        The indices of the voxels in the mesh.
-    origin : array-like
-        The coordinates of the origin of the rotation axis.
-    axis : array-like
-        The direction vector of the rotation axis. This should be a 3-element sequence.
-    angle : float
-        The angle by which to rotate the mesh, in radians.
-
-    Returns
-    -------
-    tuple
-        The indices of the voxels in the rotated mesh.
-
-    Notes
-    -----
-    This function rotates the mesh by applying a rotation matrix to the voxel indices. The rotation matrix is calculated
-    using the axis-angle representation of rotations. The origin of the rotation axis is assumed to be at (0, 0, 0).
-    """
-    indices_rotated = (jnp.array(indices).T - origin) @ axangle2mat(axis, angle) + origin
-    return tuple(jnp.rint(indices_rotated).astype("int32").T)
-
-
-def voxelize_stl(stl_filename, length_lbm_unit=None, tranformation_matrix=None, pitch=None):
-    """
-    Converts an STL file to a voxelized mesh.
-
-    Parameters
-    ----------
-    stl_filename : str
-        The name of the STL file to be voxelized.
-    length_lbm_unit : float, optional
-        The unit length in LBM. Either this or 'pitch' must be provided.
-    tranformation_matrix : array-like, optional
-        A transformation matrix to be applied to the mesh before voxelization.
-    pitch : float, optional
-        The pitch of the voxel grid. Either this or 'length_lbm_unit' must be provided.
-
-    Returns
-    -------
-    trimesh.VoxelGrid, float
-        The voxelized mesh and the pitch of the voxel grid.
-
-    Notes
-    -----
-    This function uses the trimesh library to load the STL file and voxelized the mesh. If a transformation matrix is
-    provided, it is applied to the mesh before voxelization. The pitch of the voxel grid is calculated based on the
-    maximum extent of the mesh and the provided lattice Boltzmann unit length, unless a pitch is provided directly.
-    """
-    if length_lbm_unit is None and pitch is None:
-        raise ValueError("Either 'length_lbm_unit' or 'pitch' must be provided!")
-    mesh = trimesh.load_mesh(stl_filename, process=False)
-    length_phys_unit = mesh.extents.max()
-    if tranformation_matrix is not None:
-        mesh.apply_transform(tranformation_matrix)
-    if pitch is None:
-        pitch = length_phys_unit / length_lbm_unit
-    mesh_voxelized = mesh.voxelized(pitch=pitch)
-    return mesh_voxelized, pitch
-
-
-def save_fields_hdf5(fields, timestep, output_dir=".", prefix="fields", shift_coords=(0, 0, 0), scale=1, compression="gzip", compression_opts=0):
-    start = time()
-    filename = str(prefix + "_" + f"{timestep:08d}.h5")
-    output_filename = os.path.join(output_dir, filename)
-
-    # Determine the dimensions (assuming all fields have the same shape)
-    for key, value in fields.items():
-        if key == list(fields.keys())[0]:
-            dimensions = value.shape
-        else:
-            assert value.shape == dimensions, "All fields must have the same dimensions!"
-
-    with h5py.File(output_filename, "w") as f:
-        # Write field data with Fortran order to match the VTK convention
-        for key, value in fields.items():
-            value = np.transpose(value, (2, 1, 0))
-
-            dataset = f.create_dataset(key, data=value, dtype="float32", compression=compression, compression_opts=compression_opts)
-            dataset.attrs["origin"] = shift_coords
-            dataset.attrs["spacing"] = (scale, scale, scale)
-
-    # Write the XDMF file using HyperSlab to properly reference the HDF5 data
-    xdmf_filename = os.path.join(output_dir, prefix + "_" + f"{timestep:08d}.xdmf")
-    with open(xdmf_filename, "w") as xdmf:
-        xdmf.write(f"""<?xml version="1.0" ?>
-<Xdmf Version="3.0" xmlns:xi="http://www.w3.org/2001/XInclude">
-  <Domain>
-    <Grid Name="fields" GridType="Uniform">
-      <Topology TopologyType="3DCoRectMesh" Dimensions="{dimensions[2] + 1} {dimensions[1] + 1} {dimensions[0] + 1}"/>
-      <Geometry GeometryType="ORIGIN_DXDYDZ">
-        <DataItem Dimensions="3" NumberType="Float" Precision="4" Format="XML">
-          {shift_coords[2]} {shift_coords[1]} {shift_coords[0]}
-        </DataItem>
-        <DataItem Dimensions="3" NumberType="Float" Precision="4" Format="XML">
-          {scale} {scale} {scale}
-        </DataItem>
-      </Geometry>
-""")
-        for key in fields.keys():
-            xdmf.write(f"""
-      <Attribute Name="{key}" AttributeType="Scalar" Center="Cell">
-        <DataItem ItemType="HyperSlab" Dimensions="{dimensions[2]} {dimensions[1]} {dimensions[0]}" NumberType="Float" Precision="4" Format="HDF">
-          <DataItem Dimensions="3 3" Format="XML">
-            0 0 0
-            1 1 1
-            {dimensions[2]} {dimensions[1]} {dimensions[0]}
-          </DataItem>
-          <DataItem Dimensions="{dimensions[2]} {dimensions[1]} {dimensions[0]}" NumberType="Float" Precision="4" Format="HDF">
-            {filename}:/{key}
-          </DataItem>
-        </DataItem>
-      </Attribute>
-""")
-        xdmf.write("""
-    </Grid>
-  </Domain>
-</Xdmf>
-""")
-
-    print(f"Saved {output_filename} and {xdmf_filename} in {time() - start:.6f} seconds.")
-
-
-def axangle2mat(axis, angle, is_normalized=False):
-    """Rotation matrix for rotation angle `angle` around `axis`
-    Parameters
-    ----------
-    axis : 3 element sequence
-       vector specifying axis for rotation.
-    angle : scalar
-       angle of rotation in radians.
-    is_normalized : bool, optional
-       True if `axis` is already normalized (has norm of 1).  Default False.
-    Returns
-    -------
-    mat : array shape (3,3)
-       rotation matrix for specified rotation
-    Notes
-    -----
-    From : https://github.com/matthew-brett/transforms3d
-    Ref : http://en.wikipedia.org/wiki/Rotation_matrix#Axis_and_angle
-    """
-    x, y, z = axis
-    if not is_normalized:
-        n = jnp.sqrt(x * x + y * y + z * z)
-        x = x / n
-        y = y / n
-        z = z / n
-    c = jnp.cos(angle)
-    s = jnp.sin(angle)
-    C = 1 - c
-    xs = x * s
-    ys = y * s
-    zs = z * s
-    xC = x * C
-    yC = y * C
-    zC = z * C
-    xyC = x * yC
-    yzC = y * zC
-    zxC = z * xC
-    return jnp.array([
-        [x * xC + c, xyC - zs, zxC + ys],
-        [xyC + zs, y * yC + c, yzC - xs],
-        [zxC - ys, yzC + xs, z * zC + c],
-    ])
-
-
-def voxelize_stl_open3d(stl_filename, length_lbm_unit):
-    # Load the STL file
-    mesh = o3d.io.read_triangle_mesh(stl_filename)
-    print("..Model read")
-    # Compute the voxel grid from the mesh
-    voxel_grid = o3d.geometry.VoxelGrid.create_from_triangle_mesh(mesh, voxel_size=length_lbm_unit)
-    print("...Grid created")
-    # Get the bounding box of the voxel grid
-    bbox = voxel_grid.get_axis_aligned_bounding_box()
-
-    # Calculate the number of voxels along each axis
-    grid_size = np.ceil((bbox.get_max_bound() - bbox.get_min_bound()) / length_lbm_unit).astype(int)
-
-    # Initialize an empty 3D array based on the calculated grid size
-    voxel_matrix = np.zeros(grid_size, dtype=bool)
-
-    # Convert voxel indices to a boolean matrix
-    for voxel in voxel_grid.get_voxels():
-        x, y, z = voxel.grid_index
-        voxel_matrix[x, y, z] = True
-
-    # Return the voxel matrix and the bounding box corners
-    return voxel_matrix, bbox.get_box_points()
-
-
-@partial(jit)
-def q_criterion(u, omega=2.0):
-    # Compute derivatives
-    u_x = u[0, ...]
-    u_y = u[1, ...]
-    u_z = u[2, ...]
-
-    # Compute derivatives
-    u_x_dx = (u_x[2:, 1:-1, 1:-1] - u_x[:-2, 1:-1, 1:-1]) / 2
-    u_x_dy = (u_x[1:-1, 2:, 1:-1] - u_x[1:-1, :-2, 1:-1]) / 2
-    u_x_dz = (u_x[1:-1, 1:-1, 2:] - u_x[1:-1, 1:-1, :-2]) / 2
-    u_y_dx = (u_y[2:, 1:-1, 1:-1] - u_y[:-2, 1:-1, 1:-1]) / 2
-    u_y_dy = (u_y[1:-1, 2:, 1:-1] - u_y[1:-1, :-2, 1:-1]) / 2
-    u_y_dz = (u_y[1:-1, 1:-1, 2:] - u_y[1:-1, 1:-1, :-2]) / 2
-    u_z_dx = (u_z[2:, 1:-1, 1:-1] - u_z[:-2, 1:-1, 1:-1]) / 2
-    u_z_dy = (u_z[1:-1, 2:, 1:-1] - u_z[1:-1, :-2, 1:-1]) / 2
-    u_z_dz = (u_z[1:-1, 1:-1, 2:] - u_z[1:-1, 1:-1, :-2]) / 2
-
-    # Compute vorticity
-    mu_x = u_z_dy - u_y_dz
-    mu_y = u_x_dz - u_z_dx
-    mu_z = u_y_dx - u_x_dy
-    norm_mu = jnp.sqrt(mu_x**2 + mu_y**2 + mu_z**2)
-
-    # Compute strain rate
-    s_0_0 = u_x_dx
-    s_0_1 = 0.5 * (u_x_dy + u_y_dx)
-    s_0_2 = 0.5 * (u_x_dz + u_z_dx)
-    s_1_0 = s_0_1
-    s_1_1 = u_y_dy
-    s_1_2 = 0.5 * (u_y_dz + u_z_dy)
-    s_2_0 = s_0_2
-    s_2_1 = s_1_2
-    s_2_2 = u_z_dz
-    s_dot_s = s_0_0**2 + s_0_1**2 + s_0_2**2 + s_1_0**2 + s_1_1**2 + s_1_2**2 + s_2_0**2 + s_2_1**2 + s_2_2**2
-
-    # Compute Viscosity from Omega
-    mu = ((1 / omega) - 0.5) / 3.0
-
-    # Compute shear stress components
-    tau_xy = 2 * mu * s_0_1
-    tau_xz = 2 * mu * s_0_2
-    tau_yz = 2 * mu * s_1_2
-
-    # Compute shear stress magnitude
-    tau_magnitude = jnp.sqrt(tau_xy**2 + tau_xz**2 + tau_yz**2)
-
-    # Compute omega
-    omega_0_0 = 0.0
-    omega_0_1 = 0.5 * (u_x_dy - u_y_dx)
-    omega_0_2 = 0.5 * (u_x_dz - u_z_dx)
-    omega_1_0 = -omega_0_1
-    omega_1_1 = 0.0
-    omega_1_2 = 0.5 * (u_y_dz - u_z_dy)
-    omega_2_0 = -omega_0_2
-    omega_2_1 = -omega_1_2
-    omega_2_2 = 0.0
-    omega_dot_omega = (
-        omega_0_0**2 + omega_0_1**2 + omega_0_2**2 + omega_1_0**2 + omega_1_1**2 + omega_1_2**2 + omega_2_0**2 + omega_2_1**2 + omega_2_2**2
-    )
-
-    # Compute q-criterion
-    q = 0.5 * (omega_dot_omega - s_dot_s)
-
-    # Pad outputs to match original shape
-    pad_width = ((1, 1), (1, 1), (1, 1))  # Add 1 voxel on each side in x, y, z
-    norm_mu = jnp.pad(norm_mu, pad_width, mode="constant", constant_values=0)
-    q = jnp.pad(q, pad_width, mode="constant", constant_values=0)
-    tau_xy = jnp.pad(tau_xy, pad_width, mode="constant", constant_values=0)
-    tau_xz = jnp.pad(tau_xz, pad_width, mode="constant", constant_values=0)
-    tau_yz = jnp.pad(tau_yz, pad_width, mode="constant", constant_values=0)
-    tau_magnitude = jnp.pad(tau_magnitude, pad_width, mode="constant", constant_values=0)
-
-    return norm_mu, q, tau_xy, tau_xz, tau_yz, tau_magnitude
+import numpy as np
+import matplotlib.pylab as plt
+from matplotlib import cm
+from time import time
+import pyvista as pv
+from jax.image import resize
+from jax import jit
+import jax.numpy as jnp
+from functools import partial
+import trimesh
+
+import os
+import __main__
+
+
+@partial(jit, static_argnums=(1, 2))
+def downsample_field(field, factor, method="bicubic"):
+    """
+    Downsample a JAX array by a factor of `factor` along each axis.
+
+    Parameters
+    ----------
+    field : jax.numpy.ndarray
+        The input vector field to be downsampled. This should be a 3D or 4D JAX array where the last dimension is 2 or 3 (vector components).
+    factor : int
+        The factor by which to downsample the field. The dimensions of the field will be divided by this factor.
+    method : str, optional
+        The method to use for downsampling. Default is 'bicubic'.
+
+    Returns
+    -------
+    jax.numpy.ndarray
+        The downsampled field.
+    """
+    if factor == 1:
+        return field
+    else:
+        new_shape = tuple(dim // factor for dim in field.shape[:-1])
+        downsampled_components = []
+        for i in range(field.shape[-1]):  # Iterate over the last dimension (vector components)
+            resized = resize(field[..., i], new_shape, method=method)
+            downsampled_components.append(resized)
+
+        return jnp.stack(downsampled_components, axis=-1)
+
+
+def save_image(fld, timestep=None, prefix=None, **kwargs):
+    """
+    Save an image of a field at a given timestep.
+
+    Parameters
+    ----------
+    timestep : int
+        The timestep at which the field is being saved.
+    fld : jax.numpy.ndarray
+        The field to be saved. This should be a 2D or 3D JAX array. If the field is 3D, the magnitude of the field will be calculated and saved.
+    prefix : str, optional
+        A prefix to be added to the filename. The filename will be the name of the main script file by default.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    This function saves the field as an image in the PNG format.
+    The filename is based on the name of the main script file, the provided prefix, and the timestep number.
+    If the field is 3D, the magnitude of the field is calculated and saved.
+    The image is saved with the 'nipy_spectral' colormap and the origin set to 'lower'.
+    """
+    if prefix is None:
+        fname = os.path.basename(__main__.__file__)
+        fname = os.path.splitext(fname)[0]
+    else:
+        fname = prefix
+
+    if timestep is not None:
+        fname = fname + "_" + str(timestep).zfill(4)
+
+    if len(fld.shape) > 3:
+        raise ValueError("The input field should be 2D!")
+    if len(fld.shape) == 3:
+        fld = np.sqrt(fld[0, ...] ** 2 + fld[0, ...] ** 2)
+
+    plt.clf()
+    kwargs.pop("cmap", None)
+    plt.imsave(fname + ".png", fld.T, cmap=cm.nipy_spectral, origin="lower", **kwargs)
+
+
+def save_fields_vtk(fields, timestep, output_dir=".", prefix="fields"):
+    """
+    Save VTK fields to the specified directory.
+
+    Parameters
+    ----------
+    timestep (int): The timestep number to be associated with the saved fields.
+    fields (Dict[str, np.ndarray]): A dictionary of fields to be saved. Each field must be an array-like object
+        with dimensions (nx, ny) for 2D fields or (nx, ny, nz) for 3D fields, where:
+            - nx : int, number of grid points along the x-axis
+            - ny : int, number of grid points along the y-axis
+            - nz : int, number of grid points along the z-axis (for 3D fields only)
+        The key value for each field in the dictionary must be a string containing the name of the field.
+    output_dir (str, optional, default: '.'): The directory in which to save the VTK files. Defaults to the current directory.
+    prefix (str, optional, default: 'fields'): A prefix to be added to the filename. Defaults to 'fields'.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    This function saves the VTK fields in the specified directory, with filenames based on the provided timestep number
+    and the filename. For example, if the timestep number is 10 and the file name is fields, the VTK file
+    will be saved as 'fields_0000010.vtk'in the specified directory.
+
+    """
+    # Assert that all fields have the same dimensions
+    for key, value in fields.items():
+        if key == list(fields.keys())[0]:
+            dimensions = value.shape
+        else:
+            assert value.shape == dimensions, "All fields must have the same dimensions!"
+
+    output_filename = os.path.join(output_dir, prefix + "_" + f"{timestep:07d}.vtk")
+
+    # Add 1 to the dimensions tuple as we store cell values
+    dimensions = tuple([dim + 1 for dim in dimensions])
+
+    # Create a uniform grid
+    if value.ndim == 2:
+        dimensions = dimensions + (1,)
+
+    grid = pv.ImageData(dimensions=dimensions)
+
+    # Add the fields to the grid
+    for key, value in fields.items():
+        grid[key] = value.flatten(order="F")
+
+    # Save the grid to a VTK file
+    start = time()
+    grid.save(output_filename, binary=True)
+    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
+
+
+def save_BCs_vtk(timestep, BCs, gridInfo, output_dir="."):
+    """
+    Save boundary conditions as VTK format to the specified directory.
+
+    Parameters
+    ----------
+    timestep (int): The timestep number to be associated with the saved fields.
+    BCs (List[BC]): A list of boundary conditions to be saved. Each boundary condition must be an object of type BC.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    This function saves the boundary conditions in the specified directory, with filenames based on the provided timestep number
+    and the filename. For example, if the timestep number is 10, the VTK file
+    will be saved as 'BCs_0000010.vtk'in the specified directory.
+    """
+
+    # Create a uniform grid
+    if gridInfo["nz"] == 0:
+        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, 1)
+        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], 1)
+    else:
+        gridDimensions = (gridInfo["nx"] + 1, gridInfo["ny"] + 1, gridInfo["nz"] + 1)
+        fieldDimensions = (gridInfo["nx"], gridInfo["ny"], gridInfo["nz"])
+
+    grid = pv.ImageData(dimensions=gridDimensions)
+
+    # Dictionary to keep track of encountered BC names
+    bcNamesCount = {}
+
+    for bc in BCs:
+        bcName = bc.name
+        if bcName in bcNamesCount:
+            bcNamesCount[bcName] += 1
+        else:
+            bcNamesCount[bcName] = 0
+        bcName += f"_{bcNamesCount[bcName]}"
+
+        if bc.isDynamic:
+            bcIndices, _ = bc.update_function(timestep)
+        else:
+            bcIndices = bc.indices
+
+        # Convert indices to 1D indices
+        if gridInfo["dim"] == 2:
+            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions[:-1], order="F")
+        else:
+            bcIndices = np.ravel_multi_index(bcIndices, fieldDimensions, order="F")
+
+        grid[bcName] = np.zeros(fieldDimensions, dtype=bool).flatten(order="F")
+        grid[bcName][bcIndices] = True
+
+    # Save the grid to a VTK file
+    output_filename = os.path.join(output_dir, "BCs_" + f"{timestep:07d}.vtk")
+
+    start = time()
+    grid.save(output_filename, binary=True)
+    print(f"Saved {output_filename} in {time() - start:.6f} seconds.")
+
+
+def rotate_geometry(indices, origin, axis, angle):
+    """
+    Rotates a voxelized mesh around a given axis.
+
+    Parameters
+    ----------
+    indices : array-like
+        The indices of the voxels in the mesh.
+    origin : array-like
+        The coordinates of the origin of the rotation axis.
+    axis : array-like
+        The direction vector of the rotation axis. This should be a 3-element sequence.
+    angle : float
+        The angle by which to rotate the mesh, in radians.
+
+    Returns
+    -------
+    tuple
+        The indices of the voxels in the rotated mesh.
+
+    Notes
+    -----
+    This function rotates the mesh by applying a rotation matrix to the voxel indices. The rotation matrix is calculated
+    using the axis-angle representation of rotations. The origin of the rotation axis is assumed to be at (0, 0, 0).
+    """
+    indices_rotated = (jnp.array(indices).T - origin) @ axangle2mat(axis, angle) + origin
+    return tuple(jnp.rint(indices_rotated).astype("int32").T)
+
+
+def voxelize_stl(stl_filename, length_lbm_unit=None, tranformation_matrix=None, pitch=None):
+    """
+    Converts an STL file to a voxelized mesh.
+
+    Parameters
+    ----------
+    stl_filename : str
+        The name of the STL file to be voxelized.
+    length_lbm_unit : float, optional
+        The unit length in LBM. Either this or 'pitch' must be provided.
+    tranformation_matrix : array-like, optional
+        A transformation matrix to be applied to the mesh before voxelization.
+    pitch : float, optional
+        The pitch of the voxel grid. Either this or 'length_lbm_unit' must be provided.
+
+    Returns
+    -------
+    trimesh.VoxelGrid, float
+        The voxelized mesh and the pitch of the voxel grid.
+
+    Notes
+    -----
+    This function uses the trimesh library to load the STL file and voxelized the mesh. If a transformation matrix is
+    provided, it is applied to the mesh before voxelization. The pitch of the voxel grid is calculated based on the
+    maximum extent of the mesh and the provided lattice Boltzmann unit length, unless a pitch is provided directly.
+    """
+    if length_lbm_unit is None and pitch is None:
+        raise ValueError("Either 'length_lbm_unit' or 'pitch' must be provided!")
+    mesh = trimesh.load_mesh(stl_filename, process=False)
+    length_phys_unit = mesh.extents.max()
+    if tranformation_matrix is not None:
+        mesh.apply_transform(tranformation_matrix)
+    if pitch is None:
+        pitch = length_phys_unit / length_lbm_unit
+    mesh_voxelized = mesh.voxelized(pitch=pitch)
+    return mesh_voxelized, pitch
+
+
+def axangle2mat(axis, angle, is_normalized=False):
+    """Rotation matrix for rotation angle `angle` around `axis`
+    Parameters
+    ----------
+    axis : 3 element sequence
+       vector specifying axis for rotation.
+    angle : scalar
+       angle of rotation in radians.
+    is_normalized : bool, optional
+       True if `axis` is already normalized (has norm of 1).  Default False.
+    Returns
+    -------
+    mat : array shape (3,3)
+       rotation matrix for specified rotation
+    Notes
+    -----
+    From : https://github.com/matthew-brett/transforms3d
+    Ref : http://en.wikipedia.org/wiki/Rotation_matrix#Axis_and_angle
+    """
+    x, y, z = axis
+    if not is_normalized:
+        n = jnp.sqrt(x * x + y * y + z * z)
+        x = x / n
+        y = y / n
+        z = z / n
+    c = jnp.cos(angle)
+    s = jnp.sin(angle)
+    C = 1 - c
+    xs = x * s
+    ys = y * s
+    zs = z * s
+    xC = x * C
+    yC = y * C
+    zC = z * C
+    xyC = x * yC
+    yzC = y * zC
+    zxC = z * xC
+    return jnp.array([
+        [x * xC + c, xyC - zs, zxC + ys],
+        [xyC + zs, y * yC + c, yzC - xs],
+        [zxC - ys, yzC + xs, z * zC + c],
+    ])

From 0086ecc42bd742cc312b6e2de4d1ee85e2dbc5ad Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 17 Jun 2025 10:40:37 -0400
Subject: [PATCH 073/208] added a generic read and write method to further
 unify Warp and Neon backends

---
 xlb/operator/boundary_masker/aabb.py | 14 ++++----
 xlb/operator/operator.py             | 54 ++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index e7533c89..dfa52dd8 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -50,11 +50,9 @@ def kernel(
             cell_center_pos = self.index_to_position(index)
             HALF_VOXEL = wp.vec3(0.5, 0.5, 0.5)
 
-            if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255) or self.mesh_voxel_intersect(
-                mesh_id=mesh_id, low=cell_center_pos - HALF_VOXEL
-            ):
+            if self.read_field(bc_mask, index, 0) == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - HALF_VOXEL):
                 # Make solid voxel
-                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+                self.write_field(bc_mask, index, 0, wp.uint8(255))
             else:
                 # Find the boundary voxels and their missing directions
                 for direction_idx in range(1, _q):
@@ -64,8 +62,8 @@ def kernel(
                     if self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos + direction_vec - HALF_VOXEL):
                         # We know we have a solid neighbor
                         # Set the boundary id and missing_mask
-                        bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                        missing_mask[_opp_indices[direction_idx], index[0], index[1], index[2]] = wp.uint8(True)
+                        self.write_field(bc_mask, index, 0, wp.uint8(id_number))
+                        self.write_field(missing_mask, index, _opp_indices[direction_idx], wp.uint8(True))
 
                         # If we don't need the mesh distance, we can return early
                         if not needs_mesh_distance:
@@ -81,11 +79,11 @@ def kernel(
                             # We reduce the distance to give some wall thickness
                             dist = wp.length(pos_mesh - cell_center_pos) - 0.5 * max_length
                             weight = self.store_dtype(dist / max_length)
-                            distances[direction_idx, index[0], index[1], index[2]] = weight
+                            self.write_field(distances, index, direction_idx, weight)
                         else:
                             # Expected an intersection in this direction but none was found.
                             # Assume the solid extends one lattice unit beyond the BC voxel leading to a distance fraction of 1.
-                            distances[direction_idx, index[0], index[1], index[2]] = self.store_dtype(1.0)
+                            self.write_field(distances, index, direction_idx, self.store_dtype(1.0))
 
         return None, kernel
 
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index cc4d7c79..cefbf51c 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -1,6 +1,8 @@
 import inspect
 import traceback
 import jax
+import warp as wp
+from typing import Any
 
 from xlb.compute_backend import ComputeBackend
 from xlb import DefaultConfig
@@ -26,6 +28,10 @@ def __init__(self, velocity_set=None, precision_policy=None, compute_backend=Non
         if self.compute_backend not in ComputeBackend:
             raise ValueError(f"Compute_backend {compute_backend} is not supported")
 
+        # Construct read/write functions for the compute backend
+        if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
+            self.read_field, self.write_field = self._construct_read_write_functions()
+
         # Construct the kernel based compute_backend functions TODO: Maybe move this to the register or something
         if self.compute_backend == ComputeBackend.WARP:
             self.warp_functional, self.warp_kernel = self._construct_warp()
@@ -158,3 +164,51 @@ def _construct_neon(self):
         Leave it for now, as it is not clear how the neon backend will evolve
         """
         return None, None
+
+    def _construct_read_write_functions(self):
+        if self.compute_backend == ComputeBackend.WARP:
+
+            @wp.func
+            def read_field(
+                field: Any,
+                index: Any,
+                direction: Any,
+            ):
+                # This function reads a field value at a given index and direction.
+                return field[direction, index[0], index[1], index[2]]
+
+            @wp.func
+            def write_field(
+                field: Any,
+                index: Any,
+                direction: Any,
+                value: Any,
+            ):
+                # This function writes a value to a field at a given index and direction.
+                field[direction, index[0], index[1], index[2]] = value
+
+        elif self.compute_backend == ComputeBackend.NEON:
+
+            @wp.func
+            def read_field(
+                field: Any,
+                index: Any,
+                direction: Any,
+            ):
+                # This function reads a field value at a given index and direction.
+                return wp.neon_read(field, index, direction)
+
+            @wp.func
+            def write_field(
+                field: Any,
+                index: Any,
+                direction: Any,
+                value: Any,
+            ):
+                # This function writes a value to a field at a given index and direction.
+                wp.neon_write(field, index, direction, value)
+
+        else:
+            raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
+
+        return read_field, write_field

From f8d239c834d606558efccdc01ee59e322fecc89e Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 17 Jun 2025 17:44:17 -0400
Subject: [PATCH 074/208] added store_dtype to neon backend

---
 .../boundary_condition/boundary_condition.py         | 12 ++++--------
 xlb/operator/operator.py                             |  2 ++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 98fce104..14bf7033 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -236,14 +236,12 @@ def aux_data_init_cl(index: Any):
                         # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
                         if l == lattice_central_index:
                             # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
-                            # TODO: add self.store_dtype
-                            wp.neon_write(f_1_pn, index, l, prescribed_values[l])
+                            wp.neon_write(f_1_pn, index, l, self.store_dtype(prescribed_values[l]))
                             counter += 1
                         elif _missing_mask[l] == wp.uint8(1):
                             # The other remaining BC auxiliary data are stored in missing directions of f_1.
                             # Only store up to num_of_aux_data
-                            # TODO: add self.store_dtype
-                            wp.neon_write(f_1_pn, index, _opp_indices[l], prescribed_values[l])
+                            wp.neon_write(f_1_pn, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
                             counter += 1
                         if counter > _num_of_aux_data:
                             # Only store up to num_of_aux_data
@@ -327,14 +325,12 @@ def aux_data_init_cl(index: Any):
                         # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
                         if l == lattice_central_index:
                             # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
-                            # TODO: add self.store_dtype
-                            wp.neon_write(f_1_pn, index, l, prescribed_values[l])
+                            wp.neon_write(f_1_pn, index, l, self.store_dtype(prescribed_values[l]))
                             counter += 1
                         elif _missing_mask[l] == wp.uint8(1):
                             # The other remaining BC auxiliary data are stored in missing directions of f_1.
                             # Only store up to num_of_aux_data
-                            # TODO: add self.store_dtype
-                            wp.neon_write(f_1_pn, index, _opp_indices[l], prescribed_values[l])
+                            wp.neon_write(f_1_pn, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
                             counter += 1
                         if counter > _num_of_aux_data:
                             # Only store up to num_of_aux_data
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index cefbf51c..30a06c32 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -134,6 +134,8 @@ def store_dtype(self):
             return self.precision_policy.store_precision.jax_dtype
         elif self.compute_backend == ComputeBackend.WARP:
             return self.precision_policy.store_precision.wp_dtype
+        elif self.compute_backend == ComputeBackend.NEON:
+            return self.precision_policy.store_precision.wp_dtype
 
     def get_precision_policy(self):
         """

From 1e17d71c6831fd7cf6648712bf75d929446f31dc Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 17 Jun 2025 23:46:32 -0400
Subject: [PATCH 075/208] Added neon backend to AABB mesh masker using generic
 function calls.

---
 xlb/operator/boundary_masker/aabb.py          | 121 +++++++++++++++---
 .../indices_boundary_masker.py                |   6 +-
 .../boundary_masker/mesh_boundary_masker.py   |  84 +++++++++---
 3 files changed, 170 insertions(+), 41 deletions(-)

diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index dfa52dd8..7012e97e 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -5,6 +5,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
 from xlb.operator.operator import Operator
+import neon
 
 
 class MeshMaskerAABB(MeshBoundaryMasker):
@@ -31,23 +32,21 @@ def _construct_warp(self):
         _q = self.velocity_set.q
         _opp_indices = self.velocity_set.opp_indices
 
-        @wp.kernel
-        def kernel(
-            mesh_id: wp.uint64,
-            id_number: wp.int32,
-            distances: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.uint8),
-            needs_mesh_distance: bool,
-        ):
-            # get index
-            i, j, k = wp.tid()
-
-            # Get local indices
-            index = wp.vec3i(i, j, k)
+        # Set local constants
+        lattice_central_index = self.velocity_set.center_index
 
+        @wp.func
+        def functional(
+            index: Any,
+            mesh_id: Any,
+            id_number: Any,
+            distances: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            needs_mesh_distance: Any,
+        ):
             # position of the point
-            cell_center_pos = self.index_to_position(index)
+            cell_center_pos = self.index_to_position(bc_mask, index)
             HALF_VOXEL = wp.vec3(0.5, 0.5, 0.5)
 
             if self.read_field(bc_mask, index, 0) == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - HALF_VOXEL):
@@ -55,7 +54,12 @@ def kernel(
                 self.write_field(bc_mask, index, 0, wp.uint8(255))
             else:
                 # Find the boundary voxels and their missing directions
-                for direction_idx in range(1, _q):
+                for direction_idx in range(_q):
+                    if direction_idx == lattice_central_index:
+                        # Skip the central index as it is not relevant for boundary masking
+                        continue
+
+                    # Get the lattice direction vector
                     direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
 
                     # Check to see if this neighbor is solid
@@ -78,14 +82,39 @@ def kernel(
                             pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
                             # We reduce the distance to give some wall thickness
                             dist = wp.length(pos_mesh - cell_center_pos) - 0.5 * max_length
-                            weight = self.store_dtype(dist / max_length)
-                            self.write_field(distances, index, direction_idx, weight)
+                            weight = dist / max_length
+                            self.write_field(distances, index, direction_idx, self.store_dtype(weight))
                         else:
                             # Expected an intersection in this direction but none was found.
                             # Assume the solid extends one lattice unit beyond the BC voxel leading to a distance fraction of 1.
                             self.write_field(distances, index, direction_idx, self.store_dtype(1.0))
 
-        return None, kernel
+        @wp.kernel
+        def kernel(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.uint8),
+            needs_mesh_distance: bool,
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            functional(
+                index,
+                mesh_id,
+                id_number,
+                distances,
+                bc_mask,
+                missing_mask,
+                needs_mesh_distance,
+            )
+
+        return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(
@@ -101,3 +130,57 @@ def warp_implementation(
             bc_mask,
             missing_mask,
         )
+
+    def _construct_neon(self):
+        # Reuse the per-cell functional built by _construct_warp(); its warp kernel is discarded here.
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="MeshMaskerAABB")
+        def container(
+            mesh_id: Any,
+            id_number: Any,
+            distances: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            needs_mesh_distance: Any,
+        ):
+            def aabb_launcher(loader: neon.Loader):
+                loader.set_grid(bc_mask.get_grid())
+                bc_mask_pn = loader.get_write_handle(bc_mask)
+                missing_mask_pn = loader.get_write_handle(missing_mask)
+                distances_pn = loader.get_write_handle(distances)
+
+                @wp.func
+                def aabb_kernel(index: Any):
+                    # Forward the cell index plus the write handles of the three fields to the shared functional.
+                    functional(
+                        index,
+                        mesh_id,
+                        id_number,
+                        distances_pn,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        needs_mesh_distance,
+                    )
+
+                loader.declare_kernel(aabb_kernel)
+
+            return aabb_launcher
+
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+    ):
+        # Resolve the (mesh_id, bc_id) pair for this boundary condition via _prepare_kernel_inputs.
+        mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
+
+        # Build the container for these fields and run it on stream 0; the same field objects are returned.
+        c = self.neon_container(mesh_id, bc_id, distances, bc_mask, missing_mask, wp.static(bc.needs_mesh_distance))
+        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+        return distances, bc_mask, missing_mask
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 48d78f41..81035289 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -162,7 +162,7 @@ def kernel(
         return None, kernel
 
     # a helper for this operator
-    def _prepare_warp_kernel_inputs(self, bclist, bc_mask):
+    def _prepare_kernel_inputs(self, bclist, bc_mask):
         # Pre-allocate arrays with maximum possible size
         max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
         indices = np.zeros((3, max_size), dtype=np.int32)
@@ -214,7 +214,7 @@ def _prepare_warp_kernel_inputs(self, bclist, bc_mask):
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         # prepare warp kernel inputs
-        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_warp_kernel_inputs(bclist, bc_mask)
+        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, bc_mask)
 
         # Launch the warp kernel
         wp.launch(
@@ -239,7 +239,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         _d = self.velocity_set.d
 
         # Pre-allocate arrays with maximum possible size
-        grid_shape = bc_mask.shape[1:]  # (nx, ny) for 2D or (nx, ny, nz) for 3D
+        grid_shape = bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
         grid_warp = grid_factory(grid_shape, compute_backend=ComputeBackend.WARP, velocity_set=self.velocity_set)
         missing_mask_warp = grid_warp.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
         bc_mask_warp = grid_warp.create_field(cardinality=1, dtype=Precision.UINT8)
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index af8efa6e..b2740631 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import warp as wp
+from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
@@ -22,6 +23,11 @@ def __init__(
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
 
+        assert self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON], (
+            f"MeshBoundaryMasker is only implemented for {ComputeBackend.WARP} and {ComputeBackend.NEON} backends!"
+        )
+
+        assert self.velocity_set.d == 3, "MeshBoundaryMasker is only implemented for 3D velocity sets!"
         # Raise error if used for 2d examples:
         if self.velocity_set.d == 2:
             raise NotImplementedError("This Operator is not implemented in 2D!")
@@ -29,14 +35,37 @@ def __init__(
         # Make constants for warp
         _c = self.velocity_set.c
         _q = self.velocity_set.q
+        _d = self.velocity_set.d
+
+        @wp.func
+        def neon_index_to_warp(neon_field_hdl: Any, index: Any):
+            # Unpack the global index in Neon
+            cIdx = wp.neon_global_idx(neon_field_hdl, index)
+            gx = wp.neon_get_x(cIdx)
+            gy = wp.neon_get_y(cIdx)
+            gz = wp.neon_get_z(cIdx)
+
+            # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+            if _d == 2:
+                gy, gz = gz, gy
+
+            # Get warp indices
+            index_wp = wp.vec3i(gx, gy, gz)
+            return index_wp
 
         @wp.func
-        def index_to_position(index: wp.vec3i):
+        def index_to_position_warp(field: Any, index: wp.vec3i):
             # position of the point
             ijk = wp.vec3(wp.float32(index[0]), wp.float32(index[1]), wp.float32(index[2]))
             pos = ijk + wp.vec3(0.5, 0.5, 0.5)  # cell center
             return pos
 
+        @wp.func
+        def index_to_position_neon(field: Any, index: Any):
+            # position of the point
+            index_wp = neon_index_to_warp(field, index)
+            return index_to_position_warp(field, index_wp)
+
         @wp.func
         def is_in_bounds(index: wp.vec3i, domain_shape: wp.vec3i):
             return (
@@ -173,28 +202,25 @@ def resolve_out_of_bound_kernel(
                         missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
 
         # Construct some helper warp functions
+        self.index_to_position = index_to_position_warp if self.compute_backend == ComputeBackend.WARP else index_to_position_neon
+        self.mesh_voxel_intersect = mesh_voxel_intersect
+        self.resolve_out_of_bound_kernel = resolve_out_of_bound_kernel
+
+    def get_grid_shape(self, bc_mask):
+        """
+        Get the grid shape from the boundary mask.
+        """
         if self.compute_backend == ComputeBackend.WARP:
-            self.index_to_position = index_to_position
-            self.is_in_bounds = is_in_bounds
-            self.out_of_bound_pull_index = out_of_bound_pull_index
-            self.mesh_voxel_intersect = mesh_voxel_intersect
-            self.resolve_out_of_bound_kernel = resolve_out_of_bound_kernel
+            return bc_mask.shape[1:]
+        elif self.compute_backend == ComputeBackend.NEON:
+            return bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
+        else:
+            raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
 
-    @Operator.register_backend(ComputeBackend.JAX)
-    def jax_implementation(
-        self,
-        bc,
-        bc_mask,
-        missing_mask,
-    ):
-        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in JAX!")
-
-    def warp_implementation_base(
+    def _prepare_kernel_inputs(
         self,
         bc,
-        distances,
         bc_mask,
-        missing_mask,
     ):
         assert bc.mesh_vertices is not None, f'Please provide the mesh vertices for {bc.__class__.__name__} BC using keyword "mesh_vertices"!'
         assert bc.indices is None, f"Please use IndicesBoundaryMasker operator if {bc.__class__.__name__} is imposed on known indices of the grid!"
@@ -202,7 +228,7 @@ def warp_implementation_base(
             "Mesh points must be reshaped into an array (N, 3) where N indicates number of points!"
         )
 
-        domain_shape = bc_mask.shape[1:]  # (nx, ny, nz)
+        domain_shape = self.get_grid_shape(bc_mask)  # (nx, ny, nz)
         mesh_vertices = bc.mesh_vertices
         mesh_min = np.min(mesh_vertices, axis=0)
         mesh_max = np.max(mesh_vertices, axis=0)
@@ -222,6 +248,26 @@ def warp_implementation_base(
         )
         mesh_id = wp.uint64(mesh.id)
         bc_id = bc.id
+        return mesh_id, bc_id
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    def jax_implementation(
+        self,
+        bc,
+        bc_mask,
+        missing_mask,
+    ):
+        raise NotImplementedError(f"Operation {self.__class__.__name__} not implemented in JAX!")
+
+    def warp_implementation_base(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+    ):
+        # Prepare inputs
+        mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
 
         # Launch the appropriate warp kernel
         wp.launch(

From 33778066f74a55d793d0a499169c98da9547883f Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 18 Jun 2025 09:59:54 -0400
Subject: [PATCH 076/208] added multires mesh masker (AABB).

---
 xlb/operator/boundary_masker/__init__.py      |  1 +
 xlb/operator/boundary_masker/multires_aabb.py | 87 +++++++++++++++++++
 xlb/operator/stepper/nse_multires_stepper.py  | 55 ++++++++++--
 3 files changed, 137 insertions(+), 6 deletions(-)
 create mode 100644 xlb/operator/boundary_masker/multires_aabb.py

diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index 82fd4744..b3443cc7 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -6,3 +6,4 @@
 from xlb.operator.boundary_masker.aabb_fill import MeshMaskerAABBFill
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 from xlb.operator.boundary_masker.multires_boundary_masker import MultiresBoundaryMasker
+from xlb.operator.boundary_masker.multires_aabb import MultiresMeshMaskerAABB
diff --git a/xlb/operator/boundary_masker/multires_aabb.py b/xlb/operator/boundary_masker/multires_aabb.py
new file mode 100644
index 00000000..b5367b9f
--- /dev/null
+++ b/xlb/operator/boundary_masker/multires_aabb.py
@@ -0,0 +1,87 @@
+import warp as wp
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.boundary_masker import MeshMaskerAABB
+from xlb.operator.operator import Operator
+import neon
+
+
+class MultiresMeshMaskerAABB(MeshMaskerAABB):
+    """
+    Operator for creating boundary missing_mask from mesh using Axis-Aligned Bounding Box (AABB) voxelization in multiresolution simulations.
+
+    This implementation uses warp.mesh_query_aabb for efficient mesh-voxel intersection testing,
+    providing approximate 1-voxel thick surface detection around the mesh geometry.
+    Suitable for scenarios where fast, approximate boundary detection is sufficient.
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet,
+        precision_policy: PrecisionPolicy,
+        compute_backend: ComputeBackend,  # the enum type, not a member; JAX and WARP are rejected below
+    ):
+        # Initialise the parent masker, then reject the backends this multires operator does not support.
+        super().__init__(velocity_set, precision_policy, compute_backend)
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
+
+    def _construct_neon(self):
+        # Reuse the per-cell functional built by _construct_warp(); its warp kernel is discarded here.
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="MeshMaskerAABB")
+        def container(
+            mesh_id: Any,
+            id_number: Any,
+            distances: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            needs_mesh_distance: Any,
+            level: Any,
+        ):
+            def aabb_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask.get_grid(), level)
+                distances_pn = loader.get_mres_write_handle(distances)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_write_handle(missing_mask)
+
+                @wp.func
+                def aabb_kernel(index: Any):
+                    # Forward the cell index plus the multires write handles of the three fields to the shared functional.
+                    functional(
+                        index,
+                        mesh_id,
+                        id_number,
+                        distances_pn,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        needs_mesh_distance,
+                    )
+
+                loader.declare_kernel(aabb_kernel)
+
+            return aabb_launcher
+
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+        stream=0,
+    ):
+        # Resolve the (mesh_id, bc_id) pair for this boundary condition via _prepare_kernel_inputs.
+        mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
+
+        grid = bc_mask.get_grid()
+        for level in range(grid.num_levels):
+            # One container is built and run per grid level, all on the caller-supplied stream.
+            c = self.neon_container(mesh_id, bc_id, distances, bc_mask, missing_mask, wp.static(bc.needs_mesh_distance), level)
+            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
+        return distances, bc_mask, missing_mask
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index ec69007c..2309fd14 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -22,7 +22,11 @@
 from xlb.operator.collision import ForcedCollision
 from xlb.operator.boundary_masker import MultiresBoundaryMasker
 from xlb.helper import check_bc_overlaps
-
+from xlb.operator.boundary_masker import (
+    IndicesBoundaryMasker,
+    MeshVoxelizationMethod,
+    MultiresMeshMaskerAABB,
+)
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
     def __init__(
@@ -82,7 +86,7 @@ def prepare_fields(self, rho, u, initializer=None):
         # f_0.export_vti("f0_eq_init.vti", "init_f0")
 
         # Process boundary conditions and update masks
-        f_1, bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, f_1, bc_mask, missing_mask, xlb_grid=self.grid)
+        f_1, bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, f_1, bc_mask, missing_mask)
         # Initialize auxiliary data if needed
         f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_1, bc_mask, missing_mask)
         # bc_mask.update_host(0)
@@ -208,19 +212,58 @@ def compute(index: Any):
         return
 
     @classmethod
-    def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing_mask, xlb_grid=None):
+    def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing_mask):
         """Process boundary conditions and update boundary masks."""
+
         # Check for boundary condition overlaps
         # TODO! check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
+
         # Create boundary maskers
-        mres_masker = MultiresBoundaryMasker(
+        indices_masker = IndicesBoundaryMasker(
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
         )
 
-        # Process all boundary conditions, either defined by indices or mesh_vertices
-        f_1, bc_mask, missing_mask = mres_masker(boundary_conditions, f_1, bc_mask, missing_mask, xlb_grid=xlb_grid)
+        # Split boundary conditions by type
+        bc_with_vertices = [bc for bc in boundary_conditions if bc.mesh_vertices is not None]
+        bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
+
+        # Process indices-based boundary conditions
+        # if bc_with_indices:
+        #     bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
+
+        # Process mesh-based boundary conditions for 3D
+        if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
+            for bc in bc_with_vertices:
+                if bc.voxelization_method is MeshVoxelizationMethod.AABB:
+                    mesh_masker = MultiresMeshMaskerAABB(
+                        velocity_set=DefaultConfig.velocity_set,
+                        precision_policy=DefaultConfig.default_precision_policy,
+                        compute_backend=DefaultConfig.default_backend,
+                    )
+                # elif bc.voxelization_method is MeshVoxelizationMethod.RAY:
+                #     mesh_masker = MeshMaskerRay(
+                #         velocity_set=DefaultConfig.velocity_set,
+                #         precision_policy=DefaultConfig.default_precision_policy,
+                #         compute_backend=DefaultConfig.default_backend,
+                #     )
+                # elif bc.voxelization_method is MeshVoxelizationMethod.WINDING:
+                #     mesh_masker = MeshMaskerWinding(
+                #         velocity_set=DefaultConfig.velocity_set,
+                #         precision_policy=DefaultConfig.default_precision_policy,
+                #         compute_backend=DefaultConfig.default_backend,
+                #     )
+                # elif bc.voxelization_method is MeshVoxelizationMethod.AABB_FILL:
+                #     mesh_masker = MeshMaskerAABBFill(
+                #         velocity_set=DefaultConfig.velocity_set,
+                #         precision_policy=DefaultConfig.default_precision_policy,
+                #         compute_backend=DefaultConfig.default_backend,
+                #     )
+                else:
+                    raise ValueError(f"Unsupported voxelization method: {bc.voxelization_method}")
+                # Apply the mesh masker to the boundary condition
+                f_1, bc_mask, missing_mask = mesh_masker(bc, f_1, bc_mask, missing_mask)
 
         return f_1, bc_mask, missing_mask
 

From e8709f29eb840d1d2197dbcb30f88018262285ff Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 18 Jun 2025 16:59:38 -0400
Subject: [PATCH 077/208] Added moving boundary capability of halfway BC also
 to the JAX backend.

---
 .../bc_halfway_bounce_back.py                 | 46 ++++++++++++-------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index cf4f4259..ee7fc938 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -80,19 +80,25 @@ def __init__(
             else:
                 raise ValueError("Velocity prescribed_value must be a tuple, list, or array")
 
-            # Handle 2D velocity sets
-            if self.velocity_set.d == 2:
-                assert len(prescribed_value) == 2, "For 2D velocity set, prescribed_value must be a tuple or array of length 2!"
-                prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
-
-            # create a constant prescribed profile
-            prescribed_value = wp.vec(3, dtype=self.compute_dtype)(prescribed_value)
+            # Create a constant prescribed profile function
+            if self.compute_backend == ComputeBackend.WARP:
+                if self.velocity_set.d == 2:
+                    prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
+                prescribed_value = wp.vec(3, dtype=self.precision_policy.store_precision.wp_dtype)(prescribed_value)
+            self.profile = self._create_constant_prescribed_profile(prescribed_value)
+
+    def _create_constant_prescribed_profile(self, prescribed_value):
+        @wp.func
+        def prescribed_profile_warp(index: wp.vec3i, time: Any):
+            return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
 
-            @wp.func
-            def prescribed_profile_warp(index: wp.vec3i, time: Any):
-                return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
+        def prescribed_profile_jax():
+            return jnp.array(prescribed_value, dtype=self.precision_policy.store_precision.jax_dtype).reshape(-1, 1)
 
-            self.profile = prescribed_profile_warp
+        if self.compute_backend == ComputeBackend.JAX:
+            return prescribed_profile_jax
+        elif self.compute_backend == ComputeBackend.WARP:
+            return prescribed_profile_warp
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
@@ -100,11 +106,19 @@ def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
         boundary = bc_mask == self.id
         new_shape = (self.velocity_set.q,) + boundary.shape[1:]
         boundary = lax.broadcast_in_dim(boundary, new_shape, tuple(range(self.velocity_set.d + 1)))
-        return jnp.where(
-            jnp.logical_and(missing_mask, boundary),
-            f_pre[self.velocity_set.opp_indices],
-            f_post,
-        )
+
+        # Add contribution due to moving_wall to f_missing
+        moving_wall_component = 0.0
+        if self.needs_moving_wall_treatment:
+            u_wall = self.profile()
+            cu = self.velocity_set.w[:, None] * jnp.tensordot(self.velocity_set.c, u_wall, axes=(0, 0))
+            cu = cu.reshape((-1,) + (1,) * (len(f_post[1:].shape) - 1))
+            moving_wall_component = 6.0 * cu
+
+        # Apply the halfway bounce-back condition
+        f_post = jnp.where(jnp.logical_and(missing_mask, boundary), f_pre[self.velocity_set.opp_indices] + moving_wall_component, f_post)
+
+        return f_post
 
     def _construct_warp(self):
         # load helper functions

From 13fd0d900c07946a47659abbc441fd91e4501b39 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 18 Jun 2025 22:10:27 -0400
Subject: [PATCH 078/208] fixed couple emerging bugs in the JAX backend

---
 xlb/helper/nse_fields.py                                  | 8 +++++++-
 .../boundary_condition/bc_extrapolation_outflow.py        | 2 +-
 xlb/operator/stepper/nse_multires_stepper.py              | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/xlb/helper/nse_fields.py b/xlb/helper/nse_fields.py
index c56eb07c..c513f735 100644
--- a/xlb/helper/nse_fields.py
+++ b/xlb/helper/nse_fields.py
@@ -1,6 +1,7 @@
 from xlb import DefaultConfig
 from xlb.grid import grid_factory
 from xlb.precision_policy import Precision
+from xlb.compute_backend import ComputeBackend
 from typing import Tuple
 
 
@@ -35,7 +36,12 @@ def create_nse_fields(
     # Create fields
     f_0 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
     f_1 = grid.create_field(cardinality=velocity_set.q, dtype=precision_policy.store_precision)
-    missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.UINT8)
     bc_mask = grid.create_field(cardinality=1, dtype=Precision.UINT8)
+    if compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
+        # For WARP and NEON, we use UINT8 for missing mask
+        missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.UINT8)
+    else:
+        # For JAX, we use bool for missing mask
+        missing_mask = grid.create_field(cardinality=velocity_set.q, dtype=Precision.BOOL)
 
     return grid, f_0, f_1, missing_mask, bc_mask
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 57bf78a1..4574f3a0 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -57,7 +57,7 @@ def __init__(
         )
 
         # find and store the normal vector using indices
-        if compute_backend == ComputeBackend.JAX:
+        if self.compute_backend == ComputeBackend.JAX:
             self._get_normal_vectors(indices)
 
         # Unpack the two warp functionals needed for this BC!
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 2309fd14..6b0fe216 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -28,6 +28,7 @@
     MultiresMeshMaskerAABB,
 )
 
+
 class MultiresIncompressibleNavierStokesStepper(Stepper):
     def __init__(
         self,

From fbcc546fcf73db458350953b077a43ac7c400de3 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 19 Jun 2025 11:26:10 -0400
Subject: [PATCH 079/208] Moved padding to BC class and improved the logic of
 indices masker in JAX

---
 examples/cfd/flow_past_sphere_3d.py           |  2 +-
 .../boundary_condition/boundary_condition.py  | 18 +++-
 .../indices_boundary_masker.py                | 84 ++++++++++---------
 xlb/operator/stepper/nse_multires_stepper.py  |  7 +-
 xlb/operator/stepper/nse_stepper.py           |  1 -
 5 files changed, 64 insertions(+), 48 deletions(-)

diff --git a/examples/cfd/flow_past_sphere_3d.py b/examples/cfd/flow_past_sphere_3d.py
index 2872d7fe..12a2c9e1 100644
--- a/examples/cfd/flow_past_sphere_3d.py
+++ b/examples/cfd/flow_past_sphere_3d.py
@@ -51,7 +51,7 @@
 z = np.arange(grid_shape[2])
 X, Y, Z = np.meshgrid(x, y, z, indexing="ij")
 indices = np.where((X - grid_shape[0] // 6) ** 2 + (Y - grid_shape[1] // 2) ** 2 + (Z - grid_shape[2] // 2) ** 2 < sphere_radius**2)
-sphere = [tuple(indices[i]) for i in range(velocity_set.d)]
+sphere = [tuple(indices[i].tolist()) for i in range(velocity_set.d)]
 
 
 # Define Boundary Conditions
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 14bf7033..b996ceaf 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -7,8 +7,7 @@
 from typing import Any
 from jax import jit
 from functools import partial
-import jax
-import jax.numpy as jnp
+import numpy as np
 
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
@@ -100,6 +99,21 @@ def assemble_auxiliary_data(
         if self.compute_backend == ComputeBackend.WARP:
             self.assemble_auxiliary_data = assemble_auxiliary_data
 
+    def pad_indices(self):
+        """
+        Return the BC indices as a NumPy array. When `needs_padding` is set, every index is additionally offset
+        by each lattice velocity in `velocity_set.c` and duplicate columns are removed. It is used to find missing
+        directions in indices_boundary_masker when the BC is imposed on a geometry that is in the domain interior.
+        """
+        _d = self.velocity_set.d
+        bc_indices = np.array(self.indices)
+
+        if self.needs_padding:
+            bc_indices_padded = bc_indices[:, :, None] + self.velocity_set.c[:, None, :]  # presumably (d, N, q) — TODO confirm shapes
+            return np.unique(bc_indices_padded.reshape(_d, -1), axis=1)
+        else:
+            return bc_indices
+
     @partial(jit, static_argnums=(0,), inline=True)
     def assemble_auxiliary_data(self, f_pre, f_post, bc_mask, missing_mask):
         """
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 81035289..4e842e93 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -38,38 +38,36 @@ def are_indices_in_interior(self, indices, shape):
         :param shape: Tuple representing the shape of the domain (nx, ny) for 2D or (nx, ny, nz) for 3D.
         :return: Array of boolean flags where each flag indicates whether the corresponding index is inside the bounds.
         """
-        d = self.velocity_set.d
+        _d = self.velocity_set.d
         shape_array = np.array(shape)
-        return np.all((indices[:d] > 0) & (indices[:d] < shape_array[:d, np.newaxis] - 1), axis=0)
+        return np.all((indices[:_d] > 0) & (indices[:_d] < shape_array[:_d, np.newaxis] - 1), axis=0)
 
     @Operator.register_backend(ComputeBackend.JAX)
     # TODO HS: figure out why uncommenting the line below fails unlike other operators!
     # @partial(jit, static_argnums=(0))
     def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        # Pad the missing mask to create a grid mask to identify out of bound boundaries
-        # Set padded regin to True (i.e. boundary)
+        # Extend the missing mask by padding to identify out of bound boundaries
+        # Set padded region to True (i.e. boundary)
         dim = missing_mask.ndim - 1
+        grid_shape = bc_mask[0].shape
         nDevices = jax.device_count()
         pad_x, pad_y, pad_z = nDevices, 1, 1
-        # TODO MEHDI: There is sometimes a halting problem here when padding is used in a multi-GPU setting since we're not jitting this function.
-        # For now, we compute the bmap on GPU zero.
-        if dim == 2:
-            bmap = jnp.zeros((pad_x * 2 + bc_mask[0].shape[0], pad_y * 2 + bc_mask[0].shape[1]), dtype=jnp.uint8)
-            bmap = bmap.at[pad_x:-pad_x, pad_y:-pad_y].set(bc_mask[0])
-            grid_mask = jnp.pad(missing_mask, ((0, 0), (pad_x, pad_x), (pad_y, pad_y)), constant_values=True)
-            # bmap = jnp.pad(bc_mask[0], ((pad_x, pad_x), (pad_y, pad_y)), constant_values=0)
-        if dim == 3:
-            bmap = jnp.zeros((pad_x * 2 + bc_mask[0].shape[0], pad_y * 2 + bc_mask[0].shape[1], pad_z * 2 + bc_mask[0].shape[2]), dtype=jnp.uint8)
-            bmap = bmap.at[pad_x:-pad_x, pad_y:-pad_y, pad_z:-pad_z].set(bc_mask[0])
-            grid_mask = jnp.pad(missing_mask, ((0, 0), (pad_x, pad_x), (pad_y, pad_y), (pad_z, pad_z)), constant_values=True)
-            # bmap = jnp.pad(bc_mask[0], ((pad_x, pad_x), (pad_y, pad_y), (pad_z, pad_z)), constant_values=0)
 
-        # shift indices
-        shift_tup = (pad_x, pad_y) if dim == 2 else (pad_x, pad_y, pad_z)
+        # Shift indices due to padding
+        shift = np.array((pad_x, pad_y) if dim == 2 else (pad_x, pad_y, pad_z))[:, np.newaxis]
         if start_index is None:
             start_index = (0,) * dim
 
-        domain_shape = bc_mask[0].shape
+        # TODO MEHDI: There is sometimes a halting problem here when padding is used in a multi-GPU setting since we're not jitting this function.
+        # For now, we compute the bc_mask_extended on GPU zero.
+        if dim == 2:
+            bc_mask_extended = jnp.pad(bc_mask[0], ((pad_x, pad_x), (pad_y, pad_y)), constant_values=0)
+            missing_mask_extended = jnp.pad(missing_mask, ((0, 0), (pad_x, pad_x), (pad_y, pad_y)), constant_values=True)
+        if dim == 3:
+            bc_mask_extended = jnp.pad(bc_mask[0], ((pad_x, pad_x), (pad_y, pad_y), (pad_z, pad_z)), constant_values=0)
+            missing_mask_extended = jnp.pad(missing_mask, ((0, 0), (pad_x, pad_x), (pad_y, pad_y), (pad_z, pad_z)), constant_values=True)
+
+        # Iterate over boundary conditions and set the mask
         for bc in bclist:
             assert bc.indices is not None, f"Please specify indices associated with the {bc.__class__.__name__} BC!"
             assert bc.mesh_vertices is None, (
@@ -77,32 +75,40 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
             )
             id_number = bc.id
             bc_indices = np.array(bc.indices)
-            local_indices = bc_indices - np.array(start_index)[:, np.newaxis]
-            padded_indices = local_indices + np.array(shift_tup)[:, np.newaxis]
-            bmap = bmap.at[tuple(padded_indices)].set(id_number)
-            if any(self.are_indices_in_interior(bc_indices, domain_shape)) and bc.needs_padding:
-                # checking if all indices associated with this BC are in the interior of the domain.
-                # This flag is needed e.g. if the no-slip geometry is anywhere but at the boundaries of the computational domain.
-                if dim == 2:
-                    grid_mask = grid_mask.at[:, padded_indices[0], padded_indices[1]].set(True)
-                if dim == 3:
-                    grid_mask = grid_mask.at[:, padded_indices[0], padded_indices[1], padded_indices[2]].set(True)
-
-                # Assign the boundary id to the push indices
-                push_indices = padded_indices[:, :, None] + self.velocity_set.c[:, None, :]
-                push_indices = push_indices.reshape(dim, -1)
-                bmap = bmap.at[tuple(push_indices)].set(id_number)
+            indices_origin = np.array(start_index)[:, np.newaxis]
+            if any(self.are_indices_in_interior(bc_indices, grid_shape)):
+                # If any indices are in the interior, we assume the user-specified indices are solid indices
+                solid_indices = bc_indices - indices_origin
+                solid_indices_shifted = solid_indices + shift
+
+                # We obtain the boundary indices by padding the solid indices in all lattice directions
+                indices_padded = bc.pad_indices() - indices_origin
+                indices_shifted = indices_padded + shift
+
+                # The missing mask is set to True (meaning exterior or solid nodes) at the original solid indices.
+                # The streaming step that follows uses these flags to assign missing directions to the boundary nodes.
+                missing_mask_extended = missing_mask_extended.at[:, solid_indices_shifted[0], solid_indices_shifted[1], solid_indices_shifted[2]].set(
+                    True
+                )
+            else:
+                indices_shifted = bc_indices - indices_origin + shift
+
+            # Assign the boundary id to the shifted (and possibly padded) indices
+            bc_mask_extended = bc_mask_extended.at[tuple(indices_shifted)].set(id_number)
 
             # We are done with bc.indices. Remove them from BC objects
             bc.__dict__.pop("indices", None)
 
-        grid_mask = self.stream(grid_mask)
+        # Stream the missing mask to identify missing directions
+        missing_mask_extended = self.stream(missing_mask_extended)
+
+        # Crop the extended masks to remove padding
         if dim == 2:
-            missing_mask = grid_mask[:, pad_x:-pad_x, pad_y:-pad_y]
-            bc_mask = bc_mask.at[0].set(bmap[pad_x:-pad_x, pad_y:-pad_y])
+            missing_mask = missing_mask_extended[:, pad_x:-pad_x, pad_y:-pad_y]
+            bc_mask = bc_mask.at[0].set(bc_mask_extended[pad_x:-pad_x, pad_y:-pad_y])
         if dim == 3:
-            missing_mask = grid_mask[:, pad_x:-pad_x, pad_y:-pad_y, pad_z:-pad_z]
-            bc_mask = bc_mask.at[0].set(bmap[pad_x:-pad_x, pad_y:-pad_y, pad_z:-pad_z])
+            missing_mask = missing_mask_extended[:, pad_x:-pad_x, pad_y:-pad_y, pad_z:-pad_z]
+            bc_mask = bc_mask.at[0].set(bc_mask_extended[pad_x:-pad_x, pad_y:-pad_y, pad_z:-pad_z])
         return bc_mask, missing_mask
 
     def _construct_warp(self):
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 6b0fe216..67f84561 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -1,9 +1,5 @@
-# Base class for all stepper operators
+# Base class for all multires stepper operators
 
-from functools import partial
-
-from docutils.nodes import container
-from jax import jit
 import warp as wp
 import neon
 from typing import Any
@@ -95,6 +91,7 @@ def prepare_fields(self, rho, u, initializer=None):
         f_0.update_host(0)
         wp.synchronize()
         bc_mask.export_vti("bc_mask.vti", "bc_mask")
+        exit(0)
         # f_0.export_vti("init_f0.vti", 'init_f0')
         # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index a768c306..5e16f680 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -2,7 +2,6 @@
 
 from functools import partial
 
-from docutils.nodes import container
 from jax import jit
 import warp as wp
 import neon

From 3d6e2053e0e066c3f3a1389acaaf78f38dbe11fb Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 19 Jun 2025 15:36:01 -0400
Subject: [PATCH 080/208] Changed indices masker WARP to rely on local write
 instead of remote write.  Interior treatment is now consistent with JAX.

---
 .../boundary_condition/boundary_condition.py  |   4 +-
 .../indices_boundary_masker.py                | 127 ++++++++++++++----
 2 files changed, 105 insertions(+), 26 deletions(-)

diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index b996ceaf..be780341 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -107,9 +107,9 @@ def pad_indices(self):
         """
         _d = self.velocity_set.d
         bc_indices = np.array(self.indices)
-
+        lattice_velocity_np = self.velocity_set._c
         if self.needs_padding:
-            bc_indices_padded = bc_indices[:, :, None] + self.velocity_set.c[:, None, :]
+            bc_indices_padded = bc_indices[:, :, None] + lattice_velocity_np[:, None, :]
             return np.unique(bc_indices_padded.reshape(_d, -1), axis=1)
         else:
             return bc_indices
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 4e842e93..c72fbfa7 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -4,6 +4,7 @@
 import jax.numpy as jnp
 import numpy as np
 import warp as wp
+import copy
 
 from xlb.compute_backend import ComputeBackend
 from xlb.grid import grid_factory
@@ -122,10 +123,10 @@ def is_in_bounds(index: wp.vec3i, shape: wp.vec3i):
 
         # Construct the warp 3D kernel
         @wp.kernel
-        def kernel(
+        def kernel_domain_bounds(
             indices: wp.array2d(dtype=wp.int32),
             id_number: wp.array1d(dtype=wp.uint8),
-            is_interior: wp.array1d(dtype=wp.bool),
+            is_interior: wp.array1d(dtype=wp.uint8),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.uint8),
         ):
@@ -138,42 +139,92 @@ def kernel(
             index[1] = indices[1, ii]
             index[2] = indices[2, ii]
 
+            if is_interior[ii] == wp.uint8(True):
+                # If the index is in the interior, we set that index to be a solid node (identified by 255)
+                # This information will be used in the next kernel to identify missing directions using the
+                # padded indices of the solid node that are associated with the boundary condition.
+                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+                return
+
             # Check if index is in bounds
             shape = wp.vec3i(missing_mask.shape[1], missing_mask.shape[2], missing_mask.shape[3])
             if is_in_bounds(index, shape):
+                # Set bc_mask for all bc indices
+                bc_mask[0, index[0], index[1], index[2]] = id_number[ii]
+
                 # Stream indices
                 for l in range(_q):
                     # Get the index of the streaming direction
                     pull_index = wp.vec3i()
-                    push_index = wp.vec3i()
                     for d in range(self.velocity_set.d):
                         pull_index[d] = index[d] - _c[d, l]
-                        push_index[d] = index[d] + _c[d, l]
 
-                    # set bc_mask for all bc indices
-                    bc_mask[0, index[0], index[1], index[2]] = id_number[ii]
-
-                    # check if pull index is out of bound
+                    # Check if pull index is out of bound
                     # These directions will have missing information after streaming
                     if not is_in_bounds(pull_index, shape):
                         # Set the missing mask
                         missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
 
-                    # handling geometries in the interior of the computational domain
-                    elif is_in_bounds(pull_index, shape) and is_interior[ii]:
-                        # Set the missing mask
-                        missing_mask[l, push_index[0], push_index[1], push_index[2]] = wp.uint8(True)
-                        bc_mask[0, push_index[0], push_index[1], push_index[2]] = id_number[ii]
+        @wp.kernel
+        def kernel_interior_bc_mask(
+            indices: wp.array2d(dtype=wp.int32),
+            id_number: wp.array1d(dtype=wp.uint8),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+        ):
+            # Get the index of indices
+            ii = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i()
+            index[0] = indices[0, ii]
+            index[1] = indices[1, ii]
+            index[2] = indices[2, ii]
+
+            # Set bc_mask for all interior bc indices
+            bc_mask[0, index[0], index[1], index[2]] = id_number[ii]
+
+        @wp.kernel
+        def kernel_interior_missing_mask(
+            indices: wp.array2d(dtype=wp.int32),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.uint8),
+        ):
+            # Get the index of indices
+            ii = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i()
+            index[0] = indices[0, ii]
+            index[1] = indices[1, ii]
+            index[2] = indices[2, ii]
 
-        return None, kernel
+            shape = wp.vec3i(missing_mask.shape[1], missing_mask.shape[2], missing_mask.shape[3])
+            for l in range(_q):
+                # Get the index of the streaming direction
+                pull_index = wp.vec3i()
+                for d in range(self.velocity_set.d):
+                    pull_index[d] = index[d] - _c[d, l]
+
+                # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
+                if is_in_bounds(pull_index, shape) and bc_mask[0, pull_index[0], pull_index[1], pull_index[2]] == wp.uint8(255):
+                    missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
+
+        kernel_dic = {
+            "kernel_domain_bounds": kernel_domain_bounds,
+            "kernel_interior_bc_mask": kernel_interior_bc_mask,
+            "kernel_interior_missing_mask": kernel_interior_missing_mask,
+        }
+        return None, kernel_dic
 
     # a helper for this operator
-    def _prepare_kernel_inputs(self, bclist, bc_mask):
+    def _prepare_kernel_inputs(self, bclist, grid_shape):
         # Pre-allocate arrays with maximum possible size
-        max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
+        max_size = sum(
+            len(bc.indices[0]) if isinstance(bc.indices, (list, tuple)) else bc.indices.shape[1] for bc in bclist if bc.indices is not None
+        )
         indices = np.zeros((3, max_size), dtype=np.int32)
         id_numbers = np.zeros(max_size, dtype=np.uint8)
-        is_interior = np.zeros(max_size, dtype=bool)
+        is_interior = np.zeros(max_size, dtype=np.uint8)
 
         current_index = 0
         for bc in bclist:
@@ -195,10 +246,7 @@ def _prepare_kernel_inputs(self, bclist, bc_mask):
             id_numbers[current_index : current_index + num_indices] = bc.id
 
             # Set is_interior flags
-            if bc.needs_padding:
-                is_interior[current_index : current_index + num_indices] = self.are_indices_in_interior(bc_indices, bc_mask[0].shape)
-            else:
-                is_interior[current_index : current_index + num_indices] = False
+            is_interior[current_index : current_index + num_indices] = self.are_indices_in_interior(bc_indices, grid_shape)
 
             current_index += num_indices
 
@@ -214,17 +262,26 @@ def _prepare_kernel_inputs(self, bclist, bc_mask):
         # Convert to Warp arrays
         wp_indices = wp.array(indices, dtype=wp.int32)
         wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
-        wp_is_interior = wp.array(is_interior, dtype=wp.bool)
+        wp_is_interior = wp.array(is_interior, dtype=wp.uint8)
         return total_index, wp_indices, wp_id_numbers, wp_is_interior
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         # prepare warp kernel inputs
-        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, bc_mask)
+        bc_interior = []
+        grid_shape = bc_mask[0].shape
+        for bc in bclist:
+            if any(self.are_indices_in_interior(np.array(bc.indices), grid_shape)):
+                bc_copy = copy.copy(bc)  # shallow copy of the whole object
+                bc_copy.indices = copy.deepcopy(bc.pad_indices())  # deep copy only the modified part
+                bc_interior.append(bc_copy)
+
+        # Prepare the first kernel inputs for all items in boundary condition list
+        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
 
         # Launch the warp kernel
         wp.launch(
-            self.warp_kernel,
+            self.warp_kernel["kernel_domain_bounds"],
             dim=total_index,
             inputs=[
                 wp_indices,
@@ -234,6 +291,28 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 missing_mask,
             ],
         )
+        # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
+        # Note 1: the launch order of the following kernels is important here!
+        # Note 2: Due to a race condition, the two kernels cannot be fused together.
+        total_index, wp_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape)
+        wp.launch(
+            self.warp_kernel["kernel_interior_missing_mask"],
+            dim=total_index,
+            inputs=[
+                wp_indices,
+                bc_mask,
+                missing_mask,
+            ],
+        )
+        wp.launch(
+            self.warp_kernel["kernel_interior_bc_mask"],
+            dim=total_index,
+            inputs=[
+                wp_indices,
+                wp_id_numbers,
+                bc_mask,
+            ],
+        )
 
         return bc_mask, missing_mask
 

From b10d48107d5e7700779e87a92df90dbeccea4c08 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 19 Jun 2025 15:57:03 -0400
Subject: [PATCH 081/208] switched to generic read and write methods in indices
 masker to support both warp and neon

---
 .../boundary_masker/indices_boundary_masker.py    | 15 ++++++++-------
 .../boundary_masker/mesh_boundary_masker.py       | 11 -----------
 xlb/operator/operator.py                          | 13 +++++++++++++
 3 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index c72fbfa7..cc06075e 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -143,14 +143,14 @@ def kernel_domain_bounds(
                 # If the index is in the interior, we set that index to be a solid node (identified by 255)
                 # This information will be used in the next kernel to identify missing directions using the
                 # padded indices of the solid node that are associated with the boundary condition.
-                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+                self.write_field(bc_mask, index, 0, wp.uint8(255))
                 return
 
             # Check if index is in bounds
             shape = wp.vec3i(missing_mask.shape[1], missing_mask.shape[2], missing_mask.shape[3])
             if is_in_bounds(index, shape):
                 # Set bc_mask for all bc indices
-                bc_mask[0, index[0], index[1], index[2]] = id_number[ii]
+                self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
 
                 # Stream indices
                 for l in range(_q):
@@ -163,7 +163,7 @@ def kernel_domain_bounds(
                     # These directions will have missing information after streaming
                     if not is_in_bounds(pull_index, shape):
                         # Set the missing mask
-                        missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
+                        self.write_field(missing_mask, index, l, wp.uint8(True))
 
         @wp.kernel
         def kernel_interior_bc_mask(
@@ -181,7 +181,8 @@ def kernel_interior_bc_mask(
             index[2] = indices[2, ii]
 
             # Set bc_mask for all interior bc indices
-            bc_mask[0, index[0], index[1], index[2]] = id_number[ii]
+            self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
+            return
 
         @wp.kernel
         def kernel_interior_missing_mask(
@@ -206,8 +207,8 @@ def kernel_interior_missing_mask(
                     pull_index[d] = index[d] - _c[d, l]
 
                 # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
-                if is_in_bounds(pull_index, shape) and bc_mask[0, pull_index[0], pull_index[1], pull_index[2]] == wp.uint8(255):
-                    missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
+                if is_in_bounds(pull_index, shape) and self.read_field(bc_mask, pull_index, 0) == wp.uint8(255):
+                    self.write_field(missing_mask, index, l, wp.uint8(True))
 
         kernel_dic = {
             "kernel_domain_bounds": kernel_domain_bounds,
@@ -269,7 +270,7 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         # prepare warp kernel inputs
         bc_interior = []
-        grid_shape = bc_mask[0].shape
+        grid_shape = self.get_grid_shape(bc_mask)
         for bc in bclist:
             if any(self.are_indices_in_interior(np.array(bc.indices), grid_shape)):
                 bc_copy = copy.copy(bc)  # shallow copy of the whole object
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index b2740631..067ab323 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -206,17 +206,6 @@ def resolve_out_of_bound_kernel(
         self.mesh_voxel_intersect = mesh_voxel_intersect
         self.resolve_out_of_bound_kernel = resolve_out_of_bound_kernel
 
-    def get_grid_shape(self, bc_mask):
-        """
-        Get the grid shape from the boundary mask.
-        """
-        if self.compute_backend == ComputeBackend.WARP:
-            return bc_mask.shape[1:]
-        elif self.compute_backend == ComputeBackend.NEON:
-            return bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
-        else:
-            raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
-
     def _prepare_kernel_inputs(
         self,
         bc,
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 30a06c32..12127652 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -214,3 +214,16 @@ def write_field(
             raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
 
         return read_field, write_field
+
+    def get_grid_shape(self, bc_mask):
+        """
+        Get the grid shape from the boundary mask.
+        """
+        if self.compute_backend == ComputeBackend.JAX:
+            return bc_mask.shape[1:]
+        elif self.compute_backend == ComputeBackend.WARP:
+            return bc_mask.shape[1:]
+        elif self.compute_backend == ComputeBackend.NEON:
+            return bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
+        else:
+            raise ValueError(f"Unsupported compute backend: {self.compute_backend}")

From 07a0be60058aa5653a3c5b9eb7e12a80c0305618 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 19 Jun 2025 17:34:23 -0400
Subject: [PATCH 082/208] Added and used a helper function for boundary maskers

---
 xlb/operator/boundary_masker/__init__.py      |   1 +
 xlb/operator/boundary_masker/aabb.py          |   2 +-
 .../helper_functions_masker.py                | 109 ++++++++++++++++++
 .../indices_boundary_masker.py                |  35 +++---
 .../boundary_masker/mesh_boundary_masker.py   |  59 +++-------
 xlb/operator/operator.py                      |  13 ---
 6 files changed, 144 insertions(+), 75 deletions(-)
 create mode 100644 xlb/operator/boundary_masker/helper_functions_masker.py

diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index b3443cc7..fc1f2b45 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -1,3 +1,4 @@
+from xlb.operator.boundary_masker.helper_functions_masker import HelperFunctionsMasker
 from xlb.operator.boundary_masker.indices_boundary_masker import IndicesBoundaryMasker
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
 from xlb.operator.boundary_masker.aabb import MeshMaskerAABB
diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index 7012e97e..375fbd76 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -46,7 +46,7 @@ def functional(
             needs_mesh_distance: Any,
         ):
             # position of the point
-            cell_center_pos = self.index_to_position(bc_mask, index)
+            cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
             HALF_VOXEL = wp.vec3(0.5, 0.5, 0.5)
 
             if self.read_field(bc_mask, index, 0) == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - HALF_VOXEL):
diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
new file mode 100644
index 00000000..d9e8c98f
--- /dev/null
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -0,0 +1,109 @@
+import warp as wp
+from typing import Any
+from xlb import DefaultConfig, ComputeBackend
+
+
+class HelperFunctionsMasker(object):
+    """
+    A collection of helper functions used for the boundary masker operators.
+    """
+
+    def __init__(self, velocity_set=None, precision_policy=None, compute_backend=None):
+        if compute_backend == ComputeBackend.JAX:
+            raise ValueError("This helper class contains helper functions only for the WARP implementation of some BCs not JAX!")
+
+        # Set the default values from the global config
+        self.velocity_set = velocity_set or DefaultConfig.velocity_set
+        self.precision_policy = precision_policy or DefaultConfig.default_precision_policy
+        self.compute_backend = compute_backend or DefaultConfig.default_backend
+
+        # Set local constants
+        _d = self.velocity_set.d
+        _c = self.velocity_set.c
+
+        @wp.func
+        def neon_index_to_warp(neon_field_hdl: Any, index: Any):
+            # Unpack the global index in Neon
+            cIdx = wp.neon_global_idx(neon_field_hdl, index)
+            gx = wp.neon_get_x(cIdx)
+            gy = wp.neon_get_y(cIdx)
+            gz = wp.neon_get_z(cIdx)
+
+            # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+            if _d == 2:
+                gy, gz = gz, gy
+
+            # Get warp indices
+            index_wp = wp.vec3i(gx, gy, gz)
+            return index_wp
+
+        @wp.func
+        def index_to_position_warp(field: Any, index: wp.vec3i):
+            # position of the point
+            ijk = wp.vec3(wp.float32(index[0]), wp.float32(index[1]), wp.float32(index[2]))
+            pos = ijk + wp.vec3(0.5, 0.5, 0.5)  # cell center
+            return pos
+
+        @wp.func
+        def index_to_position_neon(field: Any, index: Any):
+            # position of the point
+            index_wp = neon_index_to_warp(field, index)
+            return index_to_position_warp(field, index_wp)
+
+        @wp.func
+        def is_in_bounds_warp(index: wp.vec3i, field: wp.array4d(dtype=wp.uint8)):
+            grid_shape = wp.vec3i(field.shape[1], field.shape[2], field.shape[3])
+            return (
+                index[0] >= 0
+                and index[0] < grid_shape[0]
+                and index[1] >= 0
+                and index[1] < grid_shape[1]
+                and index[2] >= 0
+                and index[2] < grid_shape[2]
+            )
+
+        @wp.func
+        def is_in_bounds_neon(field: Any, index: Any):
+            grid_shape = get_grid_shape_neon(field)
+            index_wp = neon_index_to_warp(field, index)
+            return is_in_bounds_warp(index_wp, grid_shape)
+
+        @wp.func
+        def get_grid_shape_neon(bc_mask: wp.array4d(dtype=wp.uint8)):
+            return bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
+
+        @wp.func
+        def get_pull_index_warp(
+            lattice_dir: wp.int32,
+            index: wp.vec3i,
+        ):
+            pull_index = wp.vec3i()
+            for d in range(self.velocity_set.d):
+                pull_index[d] = index[d] - _c[d, lattice_dir]
+
+            return pull_index
+
+        @wp.func
+        def get_pull_index_neon(
+            lattice_dir: wp.int32,
+            index: wp.vec3i,
+        ):
+            # TODO: this seems incorrect to me.
+            pull_index = wp.neon_ngh_idx(wp.int8(-_c[0, lattice_dir]), wp.int8(-_c[1, lattice_dir]), wp.int8(-_c[2, lattice_dir]))
+            return pull_index
+
+        # Construct some helper warp functions
+        self.index_to_position = index_to_position_warp if self.compute_backend == ComputeBackend.WARP else index_to_position_neon
+        self.is_in_bounds = is_in_bounds_warp if self.compute_backend == ComputeBackend.WARP else is_in_bounds_neon
+        self.get_pull_index = get_pull_index_warp if self.compute_backend == ComputeBackend.WARP else get_pull_index_neon
+
+    def get_grid_shape(self, bc_mask):
+        """
+        Get the grid shape from the boundary mask.
+        """
+        if self.compute_backend == ComputeBackend.WARP:
+            return bc_mask.shape[1:]
+        elif self.compute_backend == ComputeBackend.NEON:
+            return self.get_grid_shape_neon(bc_mask)
+        else:
+            raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index cc06075e..c759eefe 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -1,16 +1,18 @@
 from typing import Any
+import copy
 
 import jax
 import jax.numpy as jnp
 import numpy as np
 import warp as wp
-import copy
 
 from xlb.compute_backend import ComputeBackend
 from xlb.grid import grid_factory
 from xlb.operator.operator import Operator
 from xlb.operator.stream.stream import Stream
 from xlb.precision_policy import Precision
+from xlb.operator.boundary_masker.helper_functions_masker import HelperFunctionsMasker
+import neon
 
 
 class IndicesBoundaryMasker(Operator):
@@ -30,6 +32,14 @@ def __init__(
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
 
+        if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
+            # Define masker helper functions
+            self.helper_masker = HelperFunctionsMasker(
+                velocity_set=self.velocity_set,
+                precision_policy=self.precision_policy,
+                compute_backend=self.compute_backend,
+            )
+
     def are_indices_in_interior(self, indices, shape):
         """
         Check if each 2D or 3D index is inside the bounds of the domain with the given shape and not
@@ -115,11 +125,7 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
     def _construct_warp(self):
         # Make constants for warp
         _c = self.velocity_set.c
-        _q = wp.constant(self.velocity_set.q)
-
-        @wp.func
-        def is_in_bounds(index: wp.vec3i, shape: wp.vec3i):
-            return index[0] >= 0 and index[0] < shape[0] and index[1] >= 0 and index[1] < shape[1] and index[2] >= 0 and index[2] < shape[2]
+        _q = self.velocity_set.q
 
         # Construct the warp 3D kernel
         @wp.kernel
@@ -147,8 +153,7 @@ def kernel_domain_bounds(
                 return
 
             # Check if index is in bounds
-            shape = wp.vec3i(missing_mask.shape[1], missing_mask.shape[2], missing_mask.shape[3])
-            if is_in_bounds(index, shape):
+            if self.helper_masker.is_in_bounds(index, missing_mask):
                 # Set bc_mask for all bc indices
                 self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
 
@@ -161,7 +166,7 @@ def kernel_domain_bounds(
 
                     # Check if pull index is out of bound
                     # These directions will have missing information after streaming
-                    if not is_in_bounds(pull_index, shape):
+                    if not self.helper_masker.is_in_bounds(pull_index, missing_mask):
                         # Set the missing mask
                         self.write_field(missing_mask, index, l, wp.uint8(True))
 
@@ -199,7 +204,6 @@ def kernel_interior_missing_mask(
             index[1] = indices[1, ii]
             index[2] = indices[2, ii]
 
-            shape = wp.vec3i(missing_mask.shape[1], missing_mask.shape[2], missing_mask.shape[3])
             for l in range(_q):
                 # Get the index of the streaming direction
                 pull_index = wp.vec3i()
@@ -207,7 +211,7 @@ def kernel_interior_missing_mask(
                     pull_index[d] = index[d] - _c[d, l]
 
                 # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
-                if is_in_bounds(pull_index, shape) and self.read_field(bc_mask, pull_index, 0) == wp.uint8(255):
+                if self.helper_masker.is_in_bounds(pull_index, missing_mask) and self.read_field(bc_mask, pull_index, 0) == wp.uint8(255):
                     self.write_field(missing_mask, index, l, wp.uint8(True))
 
         kernel_dic = {
@@ -217,8 +221,11 @@ def kernel_interior_missing_mask(
         }
         return None, kernel_dic
 
-    # a helper for this operator
     def _prepare_kernel_inputs(self, bclist, grid_shape):
+        """
+        Prepare the inputs for the warp kernel by pre-allocating arrays and filling them with boundary condition information.
+        """
+
         # Pre-allocate arrays with maximum possible size
         max_size = sum(
             len(bc.indices[0]) if isinstance(bc.indices, (list, tuple)) else bc.indices.shape[1] for bc in bclist if bc.indices is not None
@@ -270,7 +277,7 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         # prepare warp kernel inputs
         bc_interior = []
-        grid_shape = self.get_grid_shape(bc_mask)
+        grid_shape = self.helper_masker.get_grid_shape(bc_mask)
         for bc in bclist:
             if any(self.are_indices_in_interior(np.array(bc.indices), grid_shape)):
                 bc_copy = copy.copy(bc)  # shallow copy of the whole object
@@ -319,8 +326,6 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        import neon
-
         # Make constants
         _d = self.velocity_set.d
 
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index 067ab323..fd620afd 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -7,6 +7,7 @@
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
+from xlb.operator.boundary_masker.helper_functions_masker import HelperFunctionsMasker
 
 
 class MeshBoundaryMasker(Operator):
@@ -35,53 +36,20 @@ def __init__(
         # Make constants for warp
         _c = self.velocity_set.c
         _q = self.velocity_set.q
-        _d = self.velocity_set.d
 
-        @wp.func
-        def neon_index_to_warp(neon_field_hdl: Any, index: Any):
-            # Unpack the global index in Neon
-            cIdx = wp.neon_global_idx(neon_field_hdl, index)
-            gx = wp.neon_get_x(cIdx)
-            gy = wp.neon_get_y(cIdx)
-            gz = wp.neon_get_z(cIdx)
-
-            # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
-            if _d == 2:
-                gy, gz = gz, gy
-
-            # Get warp indices
-            index_wp = wp.vec3i(gx, gy, gz)
-            return index_wp
-
-        @wp.func
-        def index_to_position_warp(field: Any, index: wp.vec3i):
-            # position of the point
-            ijk = wp.vec3(wp.float32(index[0]), wp.float32(index[1]), wp.float32(index[2]))
-            pos = ijk + wp.vec3(0.5, 0.5, 0.5)  # cell center
-            return pos
-
-        @wp.func
-        def index_to_position_neon(field: Any, index: Any):
-            # position of the point
-            index_wp = neon_index_to_warp(field, index)
-            return index_to_position_warp(field, index_wp)
-
-        @wp.func
-        def is_in_bounds(index: wp.vec3i, domain_shape: wp.vec3i):
-            return (
-                index[0] >= 0
-                and index[0] < domain_shape[0]
-                and index[1] >= 0
-                and index[1] < domain_shape[1]
-                and index[2] >= 0
-                and index[2] < domain_shape[2]
+        if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
+            # Define masker helper functions
+            self.helper_masker = HelperFunctionsMasker(
+                velocity_set=self.velocity_set,
+                precision_policy=self.precision_policy,
+                compute_backend=self.compute_backend,
             )
 
         @wp.func
         def out_of_bound_pull_index(
             lattice_dir: wp.int32,
             index: wp.vec3i,
-            domain_shape: wp.vec3i,
+            field: wp.array4d(dtype=wp.uint8),
         ):
             # Get the index of the streaming direction
             pull_index = wp.vec3i()
@@ -90,7 +58,7 @@ def out_of_bound_pull_index(
 
             # check if pull index is out of bound
             # These directions will have missing information after streaming
-            missing = not is_in_bounds(pull_index, domain_shape)
+            missing = not self.helper_masker.is_in_bounds(pull_index, field)
             return missing
 
         # Function to precompute useful values per triangle, assuming spacing is (1,1,1)
@@ -198,11 +166,10 @@ def resolve_out_of_bound_kernel(
             if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(id_number):
                 for l in range(1, _q):
                     # Ensuring out of bound pull indices are properly considered in the missing_mask
-                    if out_of_bound_pull_index(l, index, domain_shape):
+                    if out_of_bound_pull_index(l, index, missing_mask):
                         missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
 
         # Construct some helper warp functions
-        self.index_to_position = index_to_position_warp if self.compute_backend == ComputeBackend.WARP else index_to_position_neon
         self.mesh_voxel_intersect = mesh_voxel_intersect
         self.resolve_out_of_bound_kernel = resolve_out_of_bound_kernel
 
@@ -217,14 +184,14 @@ def _prepare_kernel_inputs(
             "Mesh points must be reshaped into an array (N, 3) where N indicates number of points!"
         )
 
-        domain_shape = self.get_grid_shape(bc_mask)  # (nx, ny, nz)
+        grid_shape = self.helper_masker.get_grid_shape(bc_mask)  # (nx, ny, nz)
         mesh_vertices = bc.mesh_vertices
         mesh_min = np.min(mesh_vertices, axis=0)
         mesh_max = np.max(mesh_vertices, axis=0)
 
-        if any(mesh_min < 0) or any(mesh_max >= domain_shape):
+        if any(mesh_min < 0) or any(mesh_max >= grid_shape):
             raise ValueError(
-                f"Mesh extents ({mesh_min}, {mesh_max}) exceed domain dimensions {domain_shape}. The mesh must be fully contained within the domain."
+                f"Mesh extents ({mesh_min}, {mesh_max}) exceed domain dimensions {grid_shape}. The mesh must be fully contained within the domain."
             )
 
         # We are done with bc.mesh_vertices. Remove them from BC objects
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 12127652..30a06c32 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -214,16 +214,3 @@ def write_field(
             raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
 
         return read_field, write_field
-
-    def get_grid_shape(self, bc_mask):
-        """
-        Get the grid shape from the boundary mask.
-        """
-        if self.compute_backend == ComputeBackend.JAX:
-            return bc_mask.shape[1:]
-        elif self.compute_backend == ComputeBackend.WARP:
-            return bc_mask.shape[1:]
-        elif self.compute_backend == ComputeBackend.NEON:
-            return bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
-        else:
-            raise ValueError(f"Unsupported compute backend: {self.compute_backend}")

From 219cfd1ff870977b4a3c60c88f6bafaed16082b2 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 19 Jun 2025 22:22:17 -0400
Subject: [PATCH 083/208] added read_field_neighbor and further unified neon
 and warp helpers in indices masker

---
 .../helper_functions_masker.py                | 22 ++++++------
 .../indices_boundary_masker.py                | 15 ++++----
 xlb/operator/operator.py                      | 35 +++++++++++++++++++
 3 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index d9e8c98f..774dd578 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -51,7 +51,7 @@ def index_to_position_neon(field: Any, index: Any):
             return index_to_position_warp(field, index_wp)
 
         @wp.func
-        def is_in_bounds_warp(index: wp.vec3i, field: wp.array4d(dtype=wp.uint8)):
+        def is_in_bounds(index: wp.vec3i, field: wp.array4d(dtype=wp.uint8)):
             grid_shape = wp.vec3i(field.shape[1], field.shape[2], field.shape[3])
             return (
                 index[0] >= 0
@@ -62,18 +62,13 @@ def is_in_bounds_warp(index: wp.vec3i, field: wp.array4d(dtype=wp.uint8)):
                 and index[2] < grid_shape[2]
             )
 
-        @wp.func
-        def is_in_bounds_neon(field: Any, index: Any):
-            grid_shape = get_grid_shape_neon(field)
-            index_wp = neon_index_to_warp(field, index)
-            return is_in_bounds_warp(index_wp, grid_shape)
-
         @wp.func
         def get_grid_shape_neon(bc_mask: wp.array4d(dtype=wp.uint8)):
             return bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
 
         @wp.func
         def get_pull_index_warp(
+            field: wp.array4d(dtype=wp.uint8),
             lattice_dir: wp.int32,
             index: wp.vec3i,
         ):
@@ -81,20 +76,23 @@ def get_pull_index_warp(
             for d in range(self.velocity_set.d):
                 pull_index[d] = index[d] - _c[d, lattice_dir]
 
-            return pull_index
+            return pull_index, pull_index
 
         @wp.func
         def get_pull_index_neon(
+            field: wp.array4d(dtype=wp.uint8),
             lattice_dir: wp.int32,
             index: wp.vec3i,
         ):
-            # TODO: this seems incorrect to me.
-            pull_index = wp.neon_ngh_idx(wp.int8(-_c[0, lattice_dir]), wp.int8(-_c[1, lattice_dir]), wp.int8(-_c[2, lattice_dir]))
-            return pull_index
+            # Convert the index to warp
+            index_wp = neon_index_to_warp(field, index)
+            pull_index_wp = get_pull_index_warp(field, lattice_dir, index_wp)
+            pull_index_neon = wp.neon_ngh_idx(wp.int8(-_c[0, lattice_dir]), wp.int8(-_c[1, lattice_dir]), wp.int8(-_c[2, lattice_dir]))
+            return pull_index_wp, pull_index_neon
 
         # Construct some helper warp functions
+        self.is_in_bounds = is_in_bounds
         self.index_to_position = index_to_position_warp if self.compute_backend == ComputeBackend.WARP else index_to_position_neon
-        self.is_in_bounds = is_in_bounds_warp if self.compute_backend == ComputeBackend.WARP else is_in_bounds_neon
         self.get_pull_index = get_pull_index_warp if self.compute_backend == ComputeBackend.WARP else get_pull_index_neon
 
     def get_grid_shape(self, bc_mask):
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index c759eefe..0c9a07fb 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -124,7 +124,6 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
 
     def _construct_warp(self):
         # Make constants for warp
-        _c = self.velocity_set.c
         _q = self.velocity_set.q
 
         # Construct the warp 3D kernel
@@ -159,10 +158,8 @@ def kernel_domain_bounds(
 
                 # Stream indices
                 for l in range(_q):
-                    # Get the index of the streaming direction
-                    pull_index = wp.vec3i()
-                    for d in range(self.velocity_set.d):
-                        pull_index[d] = index[d] - _c[d, l]
+                    # Get the pull index which is the index of the neighboring node where information is pulled from
+                    pull_index, _ = self.helper_masker.get_pull_index(bc_mask, l, index)
 
                     # Check if pull index is out of bound
                     # These directions will have missing information after streaming
@@ -206,12 +203,12 @@ def kernel_interior_missing_mask(
 
             for l in range(_q):
                 # Get the index of the streaming direction
-                pull_index = wp.vec3i()
-                for d in range(self.velocity_set.d):
-                    pull_index[d] = index[d] - _c[d, l]
+                pull_index_data, pull_index_handle = self.helper_masker.get_pull_index(bc_mask, l, index)
 
                 # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
-                if self.helper_masker.is_in_bounds(pull_index, missing_mask) and self.read_field(bc_mask, pull_index, 0) == wp.uint8(255):
+                if (self.helper_masker.is_in_bounds(pull_index_data, missing_mask)) and (
+                    self.read_field_neighbor(bc_mask, index, pull_index_handle, 0) == wp.uint8(255)
+                ):
                     self.write_field(missing_mask, index, l, wp.uint8(True))
 
         kernel_dic = {
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 30a06c32..b0906e81 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -31,6 +31,7 @@ def __init__(self, velocity_set=None, precision_policy=None, compute_backend=Non
         # Construct read/write functions for the compute backend
         if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
             self.read_field, self.write_field = self._construct_read_write_functions()
+            self.read_field_neighbor = self._construct_read_field_neighbor()
 
         # Construct the kernel based compute_backend functions TODO: Maybe move this to the register or something
         if self.compute_backend == ComputeBackend.WARP:
@@ -214,3 +215,37 @@ def write_field(
             raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
 
         return read_field, write_field
+
+    def _construct_read_field_neighbor(self):
+        """
+        Construct a function to read a field value at a neighboring index along a given direction.
+        """
+        if self.compute_backend == ComputeBackend.WARP:
+
+            @wp.func
+            def read_field_neighbor(
+                field: Any,
+                index: Any,
+                neighbor: Any,
+                direction: Any,
+            ):
+                # This function reads a field value at a given neighboring index and direction.
+                return field[direction, neighbor[0], neighbor[1], neighbor[2]]
+
+        elif self.compute_backend == ComputeBackend.NEON:
+
+            @wp.func
+            def read_field_neighbor(
+                field: Any,
+                index: Any,
+                neighbor: Any,
+                direction: Any,
+            ):
+                # This function reads a field value at a given neighboring index and direction.
+                unused_is_valid = wp.bool(False)
+                return wp.neon_read_ngh(field, index, neighbor, direction, self.compute_dtype(0.0), unused_is_valid)
+
+        else:
+            raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
+
+        return read_field_neighbor

From 21a1014cae41018496c1003dfc9b9f57fec3c2d2 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 19 Jun 2025 22:35:36 -0400
Subject: [PATCH 084/208] made individual functionals for each kernel in
 indices masker

---
 .../indices_boundary_masker.py                | 130 ++++++++++++------
 1 file changed, 91 insertions(+), 39 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 0c9a07fb..b7131b91 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -53,6 +53,16 @@ def are_indices_in_interior(self, indices, shape):
         shape_array = np.array(shape)
         return np.all((indices[:_d] > 0) & (indices[:_d] < shape_array[:_d, np.newaxis] - 1), axis=0)
 
+    def _find_bclist_interior(self, bclist, bc_mask):
+        bc_interior = []
+        grid_shape = self.helper_masker.get_grid_shape(bc_mask)
+        for bc in bclist:
+            if any(self.are_indices_in_interior(np.array(bc.indices), grid_shape)):
+                bc_copy = copy.copy(bc)  # shallow copy of the whole object
+                bc_copy.indices = copy.deepcopy(bc.pad_indices())  # deep copy only the modified part
+                bc_interior.append(bc_copy)
+        return bc_interior, grid_shape
+
     @Operator.register_backend(ComputeBackend.JAX)
     # TODO HS: figure out why uncommenting the line below fails unlike other operators!
     # @partial(jit, static_argnums=(0))
@@ -126,25 +136,15 @@ def _construct_warp(self):
         # Make constants for warp
         _q = self.velocity_set.q
 
-        # Construct the warp 3D kernel
-        @wp.kernel
-        def kernel_domain_bounds(
-            indices: wp.array2d(dtype=wp.int32),
-            id_number: wp.array1d(dtype=wp.uint8),
-            is_interior: wp.array1d(dtype=wp.uint8),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.uint8),
+        @wp.func
+        def functional_domain_bounds(
+            index: Any,
+            _id_number: Any,
+            _is_interior: Any,
+            bc_mask: Any,
+            missing_mask: Any,
         ):
-            # Get the index of indices
-            ii = wp.tid()
-
-            # Get local indices
-            index = wp.vec3i()
-            index[0] = indices[0, ii]
-            index[1] = indices[1, ii]
-            index[2] = indices[2, ii]
-
-            if is_interior[ii] == wp.uint8(True):
+            if _is_interior == wp.uint8(True):
                 # If the index is in the interior, we set that index to be a solid node (identified by 255)
                 # This information will be used in the next kernel to identify missing directions using the
                 # padded indices of the solid node that are associated with the boundary condition.
@@ -154,7 +154,7 @@ def kernel_domain_bounds(
             # Check if index is in bounds
             if self.helper_masker.is_in_bounds(index, missing_mask):
                 # Set bc_mask for all bc indices
-                self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
+                self.write_field(bc_mask, index, 0, wp.uint8(_id_number))
 
                 # Stream indices
                 for l in range(_q):
@@ -167,6 +167,58 @@ def kernel_domain_bounds(
                         # Set the missing mask
                         self.write_field(missing_mask, index, l, wp.uint8(True))
 
+        @wp.func
+        def functional_interior_bc_mask(
+            index: Any,
+            _id_number: Any,
+            bc_mask: Any,
+        ):
+            # Set bc_mask for all interior bc indices
+            self.write_field(bc_mask, index, 0, wp.uint8(_id_number))
+
+        @wp.func
+        def functional_interior_missing_mask(
+            index: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+        ):
+            for l in range(_q):
+                # Get the index of the streaming direction
+                pull_index_data, pull_index_handle = self.helper_masker.get_pull_index(bc_mask, l, index)
+
+                # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
+                if (self.helper_masker.is_in_bounds(pull_index_data, missing_mask)) and (
+                    self.read_field_neighbor(bc_mask, index, pull_index_handle, 0) == wp.uint8(255)
+                ):
+                    self.write_field(missing_mask, index, l, wp.uint8(True))
+
+        # Construct the warp 3D kernel
+        @wp.kernel
+        def kernel_domain_bounds(
+            indices: wp.array2d(dtype=wp.int32),
+            id_number: wp.array1d(dtype=wp.uint8),
+            is_interior: wp.array1d(dtype=wp.uint8),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.uint8),
+        ):
+            # Get the index of indices
+            ii = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i()
+            index[0] = indices[0, ii]
+            index[1] = indices[1, ii]
+            index[2] = indices[2, ii]
+
+            # Call the functional
+            functional_domain_bounds(
+                index,
+                id_number[ii],
+                is_interior[ii],
+                bc_mask,
+                missing_mask,
+            )
+
         @wp.kernel
         def kernel_interior_bc_mask(
             indices: wp.array2d(dtype=wp.int32),
@@ -183,7 +235,11 @@ def kernel_interior_bc_mask(
             index[2] = indices[2, ii]
 
             # Set bc_mask for all interior bc indices
-            self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
+            functional_interior_bc_mask(
+                index,
+                id_number[ii],
+                bc_mask,
+            )
             return
 
         @wp.kernel
@@ -201,22 +257,23 @@ def kernel_interior_missing_mask(
             index[1] = indices[1, ii]
             index[2] = indices[2, ii]
 
-            for l in range(_q):
-                # Get the index of the streaming direction
-                pull_index_data, pull_index_handle = self.helper_masker.get_pull_index(bc_mask, l, index)
-
-                # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
-                if (self.helper_masker.is_in_bounds(pull_index_data, missing_mask)) and (
-                    self.read_field_neighbor(bc_mask, index, pull_index_handle, 0) == wp.uint8(255)
-                ):
-                    self.write_field(missing_mask, index, l, wp.uint8(True))
+            functional_interior_missing_mask(
+                index,
+                bc_mask,
+                missing_mask,
+            )
 
-        kernel_dic = {
+        functional_dict = {
+            "functional_domain_bounds": functional_domain_bounds,
+            "functional_interior_bc_mask": functional_interior_bc_mask,
+            "functional_interior_missing_mask": functional_interior_missing_mask,
+        }
+        kernel_dict = {
             "kernel_domain_bounds": kernel_domain_bounds,
             "kernel_interior_bc_mask": kernel_interior_bc_mask,
             "kernel_interior_missing_mask": kernel_interior_missing_mask,
         }
-        return None, kernel_dic
+        return functional_dict, kernel_dict
 
     def _prepare_kernel_inputs(self, bclist, grid_shape):
         """
@@ -272,14 +329,9 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        # prepare warp kernel inputs
-        bc_interior = []
-        grid_shape = self.helper_masker.get_grid_shape(bc_mask)
-        for bc in bclist:
-            if any(self.are_indices_in_interior(np.array(bc.indices), grid_shape)):
-                bc_copy = copy.copy(bc)  # shallow copy of the whole object
-                bc_copy.indices = copy.deepcopy(bc.pad_indices())  # deep copy only the modified part
-                bc_interior.append(bc_copy)
+
+        # find interior boundary conditions
+        bc_interior, grid_shape = self._find_bclist_interior(bclist, bc_mask)
 
         # Prepare the first kernel inputs for all items in boundary condition list
         total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)

From eca7208c47dfb3dc328abdf22cda29bc959dddd0 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 20 Jun 2025 11:27:36 -0400
Subject: [PATCH 085/208] Converted indices masker to be launched over the
 entire grid and not just bc indices. This is needed for consistency with Neon
 and also mesh masker.

---
 .../indices_boundary_masker.py                | 149 ++++++++++--------
 1 file changed, 79 insertions(+), 70 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index b7131b91..aae8e50c 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -139,126 +139,136 @@ def _construct_warp(self):
         @wp.func
         def functional_domain_bounds(
             index: Any,
-            _id_number: Any,
-            _is_interior: Any,
+            bc_indices: Any,
+            id_number: Any,
+            is_interior: Any,
             bc_mask: Any,
             missing_mask: Any,
         ):
-            if _is_interior == wp.uint8(True):
-                # If the index is in the interior, we set that index to be a solid node (identified by 255)
-                # This information will be used in the next kernel to identify missing directions using the
-                # padded indices of the solid node that are associated with the boundary condition.
-                self.write_field(bc_mask, index, 0, wp.uint8(255))
-                return
-
-            # Check if index is in bounds
-            if self.helper_masker.is_in_bounds(index, missing_mask):
-                # Set bc_mask for all bc indices
-                self.write_field(bc_mask, index, 0, wp.uint8(_id_number))
-
-                # Stream indices
-                for l in range(_q):
-                    # Get the pull index which is the index of the neighboring node where information is pulled from
-                    pull_index, _ = self.helper_masker.get_pull_index(bc_mask, l, index)
-
-                    # Check if pull index is out of bound
-                    # These directions will have missing information after streaming
-                    if not self.helper_masker.is_in_bounds(pull_index, missing_mask):
-                        # Set the missing mask
-                        self.write_field(missing_mask, index, l, wp.uint8(True))
+            for ii in range(bc_indices.shape[1]):
+                # If the current index does not match the boundary condition index, we skip it
+                if not (bc_indices[0, ii] == index[0] and bc_indices[1, ii] == index[1] and bc_indices[2, ii] == index[2]):
+                    continue
+
+                if is_interior[ii] == wp.uint8(True):
+                    # If the index is in the interior, we set that index to be a solid node (identified by 255)
+                    # This information will be used in the next kernel to identify missing directions using the
+                    # padded indices of the solid node that are associated with the boundary condition.
+                    self.write_field(bc_mask, index, 0, wp.uint8(255))
+                    return
+
+                # Check if index is in bounds
+                if self.helper_masker.is_in_bounds(index, missing_mask):
+                    # Set bc_mask for all bc indices
+                    self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
+
+                    # Stream indices
+                    for l in range(_q):
+                        # Get the pull index which is the index of the neighboring node where information is pulled from
+                        pull_index, _ = self.helper_masker.get_pull_index(bc_mask, l, index)
+
+                        # Check if pull index is out of bound
+                        # These directions will have missing information after streaming
+                        if not self.helper_masker.is_in_bounds(pull_index, missing_mask):
+                            # Set the missing mask
+                            self.write_field(missing_mask, index, l, wp.uint8(True))
 
         @wp.func
         def functional_interior_bc_mask(
             index: Any,
-            _id_number: Any,
+            bc_indices: Any,
+            id_number: Any,
             bc_mask: Any,
         ):
-            # Set bc_mask for all interior bc indices
-            self.write_field(bc_mask, index, 0, wp.uint8(_id_number))
+            for ii in range(bc_indices.shape[1]):
+                # If the current index does not match the boundary condition index, we skip it
+                if not (bc_indices[0, ii] == index[0] and bc_indices[1, ii] == index[1] and bc_indices[2, ii] == index[2]):
+                    continue
+                # Set bc_mask for all interior bc indices
+                self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
 
         @wp.func
         def functional_interior_missing_mask(
             index: Any,
+            bc_indices: Any,
             bc_mask: Any,
             missing_mask: Any,
         ):
-            for l in range(_q):
-                # Get the index of the streaming direction
-                pull_index_data, pull_index_handle = self.helper_masker.get_pull_index(bc_mask, l, index)
+            for ii in range(bc_indices.shape[1]):
+                # If the current index does not match the boundary condition index, we skip it
+                if not (bc_indices[0, ii] == index[0] and bc_indices[1, ii] == index[1] and bc_indices[2, ii] == index[2]):
+                    continue
+                for l in range(_q):
+                    # Get the index of the streaming direction
+                    pull_index_data, pull_index_handle = self.helper_masker.get_pull_index(bc_mask, l, index)
 
-                # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
-                if (self.helper_masker.is_in_bounds(pull_index_data, missing_mask)) and (
-                    self.read_field_neighbor(bc_mask, index, pull_index_handle, 0) == wp.uint8(255)
-                ):
-                    self.write_field(missing_mask, index, l, wp.uint8(True))
+                    # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
+                    if (self.helper_masker.is_in_bounds(pull_index_data, missing_mask)) and (
+                        self.read_field_neighbor(bc_mask, index, pull_index_handle, 0) == wp.uint8(255)
+                    ):
+                        self.write_field(missing_mask, index, l, wp.uint8(True))
 
         # Construct the warp 3D kernel
         @wp.kernel
         def kernel_domain_bounds(
-            indices: wp.array2d(dtype=wp.int32),
+            bc_indices: wp.array2d(dtype=wp.int32),
             id_number: wp.array1d(dtype=wp.uint8),
             is_interior: wp.array1d(dtype=wp.uint8),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.uint8),
         ):
-            # Get the index of indices
-            ii = wp.tid()
+            # get index
+            i, j, k = wp.tid()
 
             # Get local indices
-            index = wp.vec3i()
-            index[0] = indices[0, ii]
-            index[1] = indices[1, ii]
-            index[2] = indices[2, ii]
+            index = wp.vec3i(i, j, k)
 
             # Call the functional
             functional_domain_bounds(
                 index,
-                id_number[ii],
-                is_interior[ii],
+                bc_indices,
+                id_number,
+                is_interior,
                 bc_mask,
                 missing_mask,
             )
 
         @wp.kernel
         def kernel_interior_bc_mask(
-            indices: wp.array2d(dtype=wp.int32),
+            bc_indices: wp.array2d(dtype=wp.int32),
             id_number: wp.array1d(dtype=wp.uint8),
             bc_mask: wp.array4d(dtype=wp.uint8),
         ):
-            # Get the index of indices
-            ii = wp.tid()
+            # get index
+            i, j, k = wp.tid()
 
             # Get local indices
-            index = wp.vec3i()
-            index[0] = indices[0, ii]
-            index[1] = indices[1, ii]
-            index[2] = indices[2, ii]
+            index = wp.vec3i(i, j, k)
 
             # Set bc_mask for all interior bc indices
             functional_interior_bc_mask(
                 index,
-                id_number[ii],
+                bc_indices,
+                id_number,
                 bc_mask,
             )
             return
 
         @wp.kernel
         def kernel_interior_missing_mask(
-            indices: wp.array2d(dtype=wp.int32),
+            bc_indices: wp.array2d(dtype=wp.int32),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.uint8),
         ):
-            # Get the index of indices
-            ii = wp.tid()
+            # get index
+            i, j, k = wp.tid()
 
             # Get local indices
-            index = wp.vec3i()
-            index[0] = indices[0, ii]
-            index[1] = indices[1, ii]
-            index[2] = indices[2, ii]
+            index = wp.vec3i(i, j, k)
 
             functional_interior_missing_mask(
                 index,
+                bc_indices,
                 bc_mask,
                 missing_mask,
             )
@@ -322,26 +332,25 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
         is_interior = is_interior[:total_index]
 
         # Convert to Warp arrays
-        wp_indices = wp.array(indices, dtype=wp.int32)
+        wp_bc_indices = wp.array(indices, dtype=wp.int32)
         wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
         wp_is_interior = wp.array(is_interior, dtype=wp.uint8)
-        return total_index, wp_indices, wp_id_numbers, wp_is_interior
+        return wp_bc_indices, wp_id_numbers, wp_is_interior
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-
         # find interior boundary conditions
         bc_interior, grid_shape = self._find_bclist_interior(bclist, bc_mask)
 
         # Prepare the first kernel inputs for all items in boundary condition list
-        total_index, wp_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
+        wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
 
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel["kernel_domain_bounds"],
-            dim=total_index,
+            dim=bc_mask.shape[1:],
             inputs=[
-                wp_indices,
+                wp_bc_indices,
                 wp_id_numbers,
                 wp_is_interior,
                 bc_mask,
@@ -351,21 +360,21 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
         # Note 1: launching order of the following kernels are important here!
         # Note 2: Due to race conditioning, the two kernels cannot be fused together.
-        total_index, wp_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape)
+        wp_bc_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape)
         wp.launch(
             self.warp_kernel["kernel_interior_missing_mask"],
-            dim=total_index,
+            dim=bc_mask.shape[1:],
             inputs=[
-                wp_indices,
+                wp_bc_indices,
                 bc_mask,
                 missing_mask,
             ],
         )
         wp.launch(
             self.warp_kernel["kernel_interior_bc_mask"],
-            dim=total_index,
+            dim=bc_mask.shape[1:],
             inputs=[
-                wp_indices,
+                wp_bc_indices,
                 wp_id_numbers,
                 bc_mask,
             ],

From 878a1f37d03acfa85df683621466bfd8d677d2ee Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 20 Jun 2025 11:45:22 -0400
Subject: [PATCH 086/208] added a helper to check if a grid index matches a bc
 index

---
 .../helper_functions_masker.py                | 20 +++++++++++++++++++
 .../indices_boundary_masker.py                |  6 +++---
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index 774dd578..2dc2ee8f 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -90,10 +90,30 @@ def get_pull_index_neon(
             pull_index_neon = wp.neon_ngh_idx(wp.int8(-_c[0, lattice_dir]), wp.int8(-_c[1, lattice_dir]), wp.int8(-_c[2, lattice_dir]))
             return pull_index_wp, pull_index_neon
 
+        @wp.func
+        def is_in_bc_indices_warp(
+            field: wp.array4d(dtype=wp.uint8),
+            index: wp.vec3i,
+            bc_indices: wp.array2d(dtype=wp.int32),
+            ii: wp.int32,
+        ):
+            return bc_indices[0, ii] == index[0] and bc_indices[1, ii] == index[1] and bc_indices[2, ii] == index[2]
+
+        @wp.func
+        def is_in_bc_indices_neon(
+            field: Any,
+            index: Any,
+            bc_indices: wp.array2d(dtype=wp.int32),
+            ii: wp.int32,
+        ):
+            index_wp = neon_index_to_warp(field, index)
+            return is_in_bc_indices_warp(field, index_wp, bc_indices, ii)
+
         # Construct some helper warp functions
         self.is_in_bounds = is_in_bounds
         self.index_to_position = index_to_position_warp if self.compute_backend == ComputeBackend.WARP else index_to_position_neon
         self.get_pull_index = get_pull_index_warp if self.compute_backend == ComputeBackend.WARP else get_pull_index_neon
+        self.is_in_bc_indices = is_in_bc_indices_warp if self.compute_backend == ComputeBackend.WARP else is_in_bc_indices_neon
 
     def get_grid_shape(self, bc_mask):
         """
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index aae8e50c..5b658bd8 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -147,7 +147,7 @@ def functional_domain_bounds(
         ):
             for ii in range(bc_indices.shape[1]):
                 # If the current index does not match the boundary condition index, we skip it
-                if not (bc_indices[0, ii] == index[0] and bc_indices[1, ii] == index[1] and bc_indices[2, ii] == index[2]):
+                if not self.helper_masker.is_in_bc_indices(bc_mask, index, bc_indices, ii):
                     continue
 
                 if is_interior[ii] == wp.uint8(True):
@@ -182,7 +182,7 @@ def functional_interior_bc_mask(
         ):
             for ii in range(bc_indices.shape[1]):
                 # If the current index does not match the boundary condition index, we skip it
-                if not (bc_indices[0, ii] == index[0] and bc_indices[1, ii] == index[1] and bc_indices[2, ii] == index[2]):
+                if not self.helper_masker.is_in_bc_indices(bc_mask, index, bc_indices, ii):
                     continue
                 # Set bc_mask for all interior bc indices
                 self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
@@ -196,7 +196,7 @@ def functional_interior_missing_mask(
         ):
             for ii in range(bc_indices.shape[1]):
                 # If the current index does not match the boundary condition index, we skip it
-                if not (bc_indices[0, ii] == index[0] and bc_indices[1, ii] == index[1] and bc_indices[2, ii] == index[2]):
+                if not self.helper_masker.is_in_bc_indices(bc_mask, index, bc_indices, ii):
                     continue
                 for l in range(_q):
                     # Get the index of the streaming direction

From 9a9a92962f017d1dacc1c0ac3c4f3eda7ea099ed Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 20 Jun 2025 19:07:42 -0400
Subject: [PATCH 087/208] Added neon containers to indices masker. Working now
 but still lots of refactoring to be done.

---
 .../bc_halfway_bounce_back.py                 |   2 +-
 .../helper_functions_masker.py                |  37 ++-
 .../indices_boundary_masker.py                | 218 ++++++++++++------
 .../boundary_masker/mesh_boundary_masker.py   |   7 +-
 xlb/operator/operator.py                      |   8 +-
 5 files changed, 177 insertions(+), 95 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 39f18de2..3d9edeab 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -81,7 +81,7 @@ def __init__(
                 raise ValueError("Velocity prescribed_value must be a tuple, list, or array")
 
             # Create a constant prescribed profile function
-            if self.compute_backend == ComputeBackend.WARP:
+            if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
                 if self.velocity_set.d == 2:
                     prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
                 prescribed_value = wp.vec(3, dtype=self.precision_policy.store_precision.wp_dtype)(prescribed_value)
diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index 2dc2ee8f..d7feef9b 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -51,8 +51,7 @@ def index_to_position_neon(field: Any, index: Any):
             return index_to_position_warp(field, index_wp)
 
         @wp.func
-        def is_in_bounds(index: wp.vec3i, field: wp.array4d(dtype=wp.uint8)):
-            grid_shape = wp.vec3i(field.shape[1], field.shape[2], field.shape[3])
+        def is_in_bounds(index: wp.vec3i, grid_shape: wp.vec3i, field: Any):
             return (
                 index[0] >= 0
                 and index[0] < grid_shape[0]
@@ -62,38 +61,36 @@ def is_in_bounds(index: wp.vec3i, field: wp.array4d(dtype=wp.uint8)):
                 and index[2] < grid_shape[2]
             )
 
-        @wp.func
-        def get_grid_shape_neon(bc_mask: wp.array4d(dtype=wp.uint8)):
-            return bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
-
         @wp.func
         def get_pull_index_warp(
-            field: wp.array4d(dtype=wp.uint8),
+            field: Any,
             lattice_dir: wp.int32,
             index: wp.vec3i,
         ):
             pull_index = wp.vec3i()
+            offset = wp.vec3i()
             for d in range(self.velocity_set.d):
-                pull_index[d] = index[d] - _c[d, lattice_dir]
+                offset[d] = -_c[d, lattice_dir]
+                pull_index[d] = index[d] + offset[d]
 
-            return pull_index, pull_index
+            return pull_index, offset
 
         @wp.func
         def get_pull_index_neon(
-            field: wp.array4d(dtype=wp.uint8),
+            field: Any,
             lattice_dir: wp.int32,
-            index: wp.vec3i,
+            index: Any,
         ):
             # Convert the index to warp
             index_wp = neon_index_to_warp(field, index)
-            pull_index_wp = get_pull_index_warp(field, lattice_dir, index_wp)
-            pull_index_neon = wp.neon_ngh_idx(wp.int8(-_c[0, lattice_dir]), wp.int8(-_c[1, lattice_dir]), wp.int8(-_c[2, lattice_dir]))
-            return pull_index_wp, pull_index_neon
+            pull_index_wp, _ = get_pull_index_warp(field, lattice_dir, index_wp)
+            offset = wp.neon_ngh_idx(wp.int8(-_c[0, lattice_dir]), wp.int8(-_c[1, lattice_dir]), wp.int8(-_c[2, lattice_dir]))
+            return pull_index_wp, offset
 
         @wp.func
         def is_in_bc_indices_warp(
-            field: wp.array4d(dtype=wp.uint8),
-            index: wp.vec3i,
+            field: Any,
+            index: Any,
             bc_indices: wp.array2d(dtype=wp.int32),
             ii: wp.int32,
         ):
@@ -115,13 +112,13 @@ def is_in_bc_indices_neon(
         self.get_pull_index = get_pull_index_warp if self.compute_backend == ComputeBackend.WARP else get_pull_index_neon
         self.is_in_bc_indices = is_in_bc_indices_warp if self.compute_backend == ComputeBackend.WARP else is_in_bc_indices_neon
 
-    def get_grid_shape(self, bc_mask):
+    def get_grid_shape(self, field):
         """
-        Get the grid shape from the boundary mask.
+        Get the grid shape from the boundary mask. This is a CPU function that returns the shape of the grid
         """
         if self.compute_backend == ComputeBackend.WARP:
-            return bc_mask.shape[1:]
+            return field.shape[1:]
         elif self.compute_backend == ComputeBackend.NEON:
-            return self.get_grid_shape_neon(bc_mask)
+            return wp.vec3i(field.get_grid().dim.x, field.get_grid().dim.y, field.get_grid().dim.z)
         else:
             raise ValueError(f"Unsupported compute backend: {self.compute_backend}")
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 5b658bd8..54706d0d 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -26,9 +26,6 @@ def __init__(
         precision_policy=None,
         compute_backend=None,
     ):
-        # Make stream operator
-        self.stream = Stream(velocity_set, precision_policy, compute_backend)
-
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
 
@@ -39,6 +36,9 @@ def __init__(
                 precision_policy=self.precision_policy,
                 compute_backend=self.compute_backend,
             )
+        else:
+            # Make stream operator
+            self.stream = Stream(velocity_set, precision_policy, compute_backend)
 
     def are_indices_in_interior(self, indices, shape):
         """
@@ -144,6 +144,7 @@ def functional_domain_bounds(
             is_interior: Any,
             bc_mask: Any,
             missing_mask: Any,
+            grid_shape: Any,
         ):
             for ii in range(bc_indices.shape[1]):
                 # If the current index does not match the boundary condition index, we skip it
@@ -157,21 +158,19 @@ def functional_domain_bounds(
                     self.write_field(bc_mask, index, 0, wp.uint8(255))
                     return
 
-                # Check if index is in bounds
-                if self.helper_masker.is_in_bounds(index, missing_mask):
-                    # Set bc_mask for all bc indices
-                    self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
+                # Set bc_mask for all bc indices
+                self.write_field(bc_mask, index, 0, wp.uint8(id_number[ii]))
 
-                    # Stream indices
-                    for l in range(_q):
-                        # Get the pull index which is the index of the neighboring node where information is pulled from
-                        pull_index, _ = self.helper_masker.get_pull_index(bc_mask, l, index)
+                # Stream indices
+                for l in range(_q):
+                    # Get the pull index which is the index of the neighboring node where information is pulled from
+                    pull_index, _ = self.helper_masker.get_pull_index(bc_mask, l, index)
 
-                        # Check if pull index is out of bound
-                        # These directions will have missing information after streaming
-                        if not self.helper_masker.is_in_bounds(pull_index, missing_mask):
-                            # Set the missing mask
-                            self.write_field(missing_mask, index, l, wp.uint8(True))
+                    # Check if pull index is out of bound
+                    # These directions will have missing information after streaming
+                    if not self.helper_masker.is_in_bounds(pull_index, grid_shape, missing_mask):
+                        # Set the missing mask
+                        self.write_field(missing_mask, index, l, wp.uint8(True))
 
         @wp.func
         def functional_interior_bc_mask(
@@ -193,6 +192,7 @@ def functional_interior_missing_mask(
             bc_indices: Any,
             bc_mask: Any,
             missing_mask: Any,
+            grid_shape: Any,
         ):
             for ii in range(bc_indices.shape[1]):
                 # If the current index does not match the boundary condition index, we skip it
@@ -200,11 +200,12 @@ def functional_interior_missing_mask(
                     continue
                 for l in range(_q):
                     # Get the index of the streaming direction
-                    pull_index_data, pull_index_handle = self.helper_masker.get_pull_index(bc_mask, l, index)
+                    pull_index, offset = self.helper_masker.get_pull_index(bc_mask, l, index)
 
                     # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
-                    if (self.helper_masker.is_in_bounds(pull_index_data, missing_mask)) and (
-                        self.read_field_neighbor(bc_mask, index, pull_index_handle, 0) == wp.uint8(255)
+                    bc_mask_ngh = self.read_field_neighbor(bc_mask, index, offset, 0)
+                    if (self.helper_masker.is_in_bounds(pull_index, grid_shape, missing_mask)) and (
+                        bc_mask_ngh == wp.uint8(255)
                     ):
                         self.write_field(missing_mask, index, l, wp.uint8(True))
 
@@ -216,6 +217,7 @@ def kernel_domain_bounds(
             is_interior: wp.array1d(dtype=wp.uint8),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.uint8),
+            grid_shape: wp.vec3i,
         ):
             # get index
             i, j, k = wp.tid()
@@ -231,6 +233,7 @@ def kernel_domain_bounds(
                 is_interior,
                 bc_mask,
                 missing_mask,
+                grid_shape,
             )
 
         @wp.kernel
@@ -259,6 +262,7 @@ def kernel_interior_missing_mask(
             bc_indices: wp.array2d(dtype=wp.int32),
             bc_mask: wp.array4d(dtype=wp.uint8),
             missing_mask: wp.array4d(dtype=wp.uint8),
+            grid_shape: wp.vec3i,
         ):
             # get index
             i, j, k = wp.tid()
@@ -271,6 +275,7 @@ def kernel_interior_missing_mask(
                 bc_indices,
                 bc_mask,
                 missing_mask,
+                grid_shape
             )
 
         functional_dict = {
@@ -355,6 +360,7 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 wp_is_interior,
                 bc_mask,
                 missing_mask,
+                grid_shape
             ],
         )
         # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
@@ -368,6 +374,7 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 wp_bc_indices,
                 bc_mask,
                 missing_mask,
+                grid_shape
             ],
         )
         wp.launch(
@@ -382,65 +389,140 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
 
         return bc_mask, missing_mask
 
-    @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        # Make constants
-        _d = self.velocity_set.d
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional_dict, _ = self._construct_warp()
+        functional_domain_bounds = functional_dict.get("functional_domain_bounds")
+        functional_interior_bc_mask = functional_dict.get("functional_interior_bc_mask")
+        functional_interior_missing_mask = functional_dict.get("functional_interior_missing_mask")
+
+        @neon.Container.factory(name="IndicesBoundaryMasker_DomainBounds")
+        def container_domain_bounds(
+            wp_bc_indices,
+            wp_id_numbers,
+            wp_is_interior,
+            bc_mask,
+            missing_mask,
+            grid_shape,
+        ):
+            def domain_bounds_launcher(loader: neon.Loader):
+                loader.set_grid(bc_mask.get_grid())
+                bc_mask_pn = loader.get_write_handle(bc_mask)
+                missing_mask_pn = loader.get_write_handle(missing_mask)
 
-        # Pre-allocate arrays with maximum possible size
-        grid_shape = bc_mask.get_grid().dim.x, bc_mask.get_grid().dim.y, bc_mask.get_grid().dim.z
-        grid_warp = grid_factory(grid_shape, compute_backend=ComputeBackend.WARP, velocity_set=self.velocity_set)
-        missing_mask_warp = grid_warp.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
-        bc_mask_warp = grid_warp.create_field(cardinality=1, dtype=Precision.UINT8)
-
-        # Use indices masker with the warp backend to build bc_mask_warp and missing_mask_warp before writing in Neon DS.
-        indices_masker_warp = IndicesBoundaryMasker(
-            velocity_set=self.velocity_set,
-            precision_policy=self.precision_policy,
-            compute_backend=ComputeBackend.WARP,
-        )
-        bc_mask_warp, missing_mask_warp = indices_masker_warp(bclist, bc_mask_warp, missing_mask_warp, start_index)
-        wp.synchronize()
-
-        @neon.Container.factory("")
-        def container(
-            bc_mask_warp: Any,
-            missing_mask_warp: Any,
-            bc_mask_field: Any,
-            missing_mask_field: Any,
+                @wp.func
+                def domain_bounds_kernel(index: Any):
+                    # apply the functional
+                    functional_domain_bounds(
+                        index,
+                        wp_bc_indices,
+                        wp_id_numbers,
+                        wp_is_interior,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        grid_shape,
+                    )
+
+                loader.declare_kernel(domain_bounds_kernel)
+
+            return domain_bounds_launcher
+
+        @neon.Container.factory(name="IndicesBoundaryMasker_InteriorBcMask")
+        def container_interior_bc_mask(
+            wp_bc_indices,
+            wp_id_numbers,
+            bc_mask,
         ):
-            def loading_step(loader: neon.Loader):
-                loader.set_grid(bc_mask_field.get_grid())
-                bc_mask_hdl = loader.get_write_handle(bc_mask_field)
-                missing_mask_hdl = loader.get_write_handle(missing_mask_field)
+            def interior_bc_mask_launcher(loader: neon.Loader):
+                loader.set_grid(bc_mask.get_grid())
+                bc_mask_pn = loader.get_write_handle(bc_mask)
 
                 @wp.func
-                def masker(gridIdx: Any):
-                    cIdx = wp.neon_global_idx(bc_mask_hdl, gridIdx)
-                    gx = wp.neon_get_x(cIdx)
-                    gy = wp.neon_get_y(cIdx)
-                    gz = wp.neon_get_z(cIdx)
+                def interior_bc_mask_kernel(index: Any):
+                    # apply the functional
+                    functional_interior_bc_mask(
+                        index,
+                        wp_bc_indices,
+                        wp_id_numbers,
+                        bc_mask_pn,
+                    )
+
+                loader.declare_kernel(interior_bc_mask_kernel)
+
+            return interior_bc_mask_launcher
+
+        @neon.Container.factory(name="IndicesBoundaryMasker_InteriorMissingMask")
+        def container_interior_missing_mask(
+            wp_bc_indices,
+            bc_mask,
+            missing_mask,
+            grid_shape,
+        ):
+            def interior_bc_mask_launcher(loader: neon.Loader):
+                loader.set_grid(bc_mask.get_grid())
+                bc_mask_pn = loader.get_write_handle(bc_mask)
+                missing_mask_pn = loader.get_write_handle(missing_mask)
 
-                    # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
-                    if _d == 2:
-                        gy, gz = gz, gy
+                @wp.func
+                def interior_missing_mask_kernel(index: Any):
+                    # apply the functional
+                    functional_interior_missing_mask(
+                        index,
+                        wp_bc_indices,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        grid_shape,
+                    )
+
+                loader.declare_kernel(interior_missing_mask_kernel)
+
+            return interior_bc_mask_launcher
+
+        container_dict = {
+            "container_domain_bounds": container_domain_bounds,
+            "container_interior_bc_mask": container_interior_bc_mask,
+            "container_interior_missing_mask": container_interior_missing_mask,
+        }
 
-                    local_mask = bc_mask_warp[0, gx, gy, gz]
-                    wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
+        return functional_dict, container_dict
 
-                    for q in range(self.velocity_set.q):
-                        is_missing = wp.uint8(missing_mask_warp[q, gx, gy, gz])
-                        wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
+        # find interior boundary conditions
+        bc_interior, grid_shape = self._find_bclist_interior(bclist, bc_mask)
 
-                loader.declare_kernel(masker)
+        # Prepare the first kernel inputs for all items in boundary condition list
+        wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
 
-            return loading_step
+        # Launch the first container
+        container_domain_bounds = self.neon_container["container_domain_bounds"](
+            wp_bc_indices,
+            wp_id_numbers,
+            wp_is_interior,
+            bc_mask,
+            missing_mask,
+            grid_shape,
+        )
+        container_domain_bounds.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
-        c = container(bc_mask_warp, missing_mask_warp, bc_mask, missing_mask)
-        c.run(0)
-        wp.synchronize()
+        # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
+        # Note 1: launching order of the following kernels are important here!
+        # Note 2: Due to race conditioning, the two kernels cannot be fused together.
+        wp_bc_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape)
+        container_interior_missing_mask = self.neon_container["container_interior_missing_mask"](
+            wp_bc_indices,
+            bc_mask,
+            missing_mask,
+            grid_shape
+        )
+        container_interior_missing_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
-        del bc_mask_warp
-        del missing_mask_warp
+        # Launch the third container
+        container_interior_bc_mask = self.neon_container["container_interior_bc_mask"](
+            wp_bc_indices,
+            wp_id_numbers,
+            bc_mask,
+        )
+        container_interior_bc_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
         return bc_mask, missing_mask
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index fd620afd..1447cbd5 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -50,6 +50,7 @@ def out_of_bound_pull_index(
             lattice_dir: wp.int32,
             index: wp.vec3i,
             field: wp.array4d(dtype=wp.uint8),
+            grid_shape: wp.vec3i,
         ):
             # Get the index of the streaming direction
             pull_index = wp.vec3i()
@@ -58,7 +59,7 @@ def out_of_bound_pull_index(
 
             # check if pull index is out of bound
             # These directions will have missing information after streaming
-            missing = not self.helper_masker.is_in_bounds(pull_index, field)
+            missing = not self.helper_masker.is_in_bounds(pull_index, grid_shape, field)
             return missing
 
         # Function to precompute useful values per triangle, assuming spacing is (1,1,1)
@@ -160,13 +161,13 @@ def resolve_out_of_bound_kernel(
             index = wp.vec3i(i, j, k)
 
             # domain shape to check for out of bounds
-            domain_shape = wp.vec3i(bc_mask.shape[1], bc_mask.shape[2], bc_mask.shape[3])
+            grid_shape = wp.vec3i(bc_mask.shape[1], bc_mask.shape[2], bc_mask.shape[3])
 
             # Find the fractional distance to the mesh in each direction
             if bc_mask[0, index[0], index[1], index[2]] == wp.uint8(id_number):
                 for l in range(1, _q):
                     # Ensuring out of bound pull indices are properly considered in the missing_mask
-                    if out_of_bound_pull_index(l, index, missing_mask):
+                    if out_of_bound_pull_index(l, index, missing_mask, grid_shape):
                         missing_mask[l, index[0], index[1], index[2]] = wp.uint8(True)
 
         # Construct some helper warp functions
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index b0906e81..37eb1705 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -226,10 +226,11 @@ def _construct_read_field_neighbor(self):
             def read_field_neighbor(
                 field: Any,
                 index: Any,
-                neighbor: Any,
+                offset: Any,
                 direction: Any,
             ):
                 # This function reads a field value at a given neighboring index and direction.
+                neighbor = index + offset
                 return field[direction, neighbor[0], neighbor[1], neighbor[2]]
 
         elif self.compute_backend == ComputeBackend.NEON:
@@ -238,12 +239,13 @@ def read_field_neighbor(
             def read_field_neighbor(
                 field: Any,
                 index: Any,
-                neighbor: Any,
+                offset: Any,
                 direction: Any,
             ):
                 # This function reads a field value at a given neighboring index and direction.
                 unused_is_valid = wp.bool(False)
-                return wp.neon_read_ngh(field, index, neighbor, direction, self.compute_dtype(0.0), unused_is_valid)
+                # TODO: the type of the returned value should be the same as field's dtype
+                return wp.neon_read_ngh(field, index, offset, direction, wp.uint8(0), unused_is_valid)
 
         else:
             raise ValueError(f"Unsupported compute backend: {self.compute_backend}")

From 4283637fad066af2dab30d462bf5680f64d2b21d Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 23 Jun 2025 10:43:24 -0400
Subject: [PATCH 088/208] WIP: added multi-res indices masker

---
 xlb/operator/boundary_masker/__init__.py      |   1 +
 .../indices_boundary_masker.py                |  28 ++-
 .../multires_indices_boundary_masker.py       | 198 ++++++++++++++++++
 xlb/operator/stepper/nse_multires_stepper.py  |   9 +-
 4 files changed, 224 insertions(+), 12 deletions(-)
 create mode 100644 xlb/operator/boundary_masker/multires_indices_boundary_masker.py

diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index fc1f2b45..9041fc41 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -8,3 +8,4 @@
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 from xlb.operator.boundary_masker.multires_boundary_masker import MultiresBoundaryMasker
 from xlb.operator.boundary_masker.multires_aabb import MultiresMeshMaskerAABB
+from xlb.operator.boundary_masker.multires_indices_boundary_masker import MultiresIndicesBoundaryMasker
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 54706d0d..1807f210 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -53,15 +53,14 @@ def are_indices_in_interior(self, indices, shape):
         shape_array = np.array(shape)
         return np.all((indices[:_d] > 0) & (indices[:_d] < shape_array[:_d, np.newaxis] - 1), axis=0)
 
-    def _find_bclist_interior(self, bclist, bc_mask):
+    def _find_bclist_interior(self, bclist, grid_shape):
         bc_interior = []
-        grid_shape = self.helper_masker.get_grid_shape(bc_mask)
         for bc in bclist:
             if any(self.are_indices_in_interior(np.array(bc.indices), grid_shape)):
                 bc_copy = copy.copy(bc)  # shallow copy of the whole object
                 bc_copy.indices = copy.deepcopy(bc.pad_indices())  # deep copy only the modified part
                 bc_interior.append(bc_copy)
-        return bc_interior, grid_shape
+        return bc_interior
 
     @Operator.register_backend(ComputeBackend.JAX)
     # TODO HS: figure out why uncommenting the line below fails unlike other operators!
@@ -344,8 +343,11 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        # find interior boundary conditions
-        bc_interior, grid_shape = self._find_bclist_interior(bclist, bc_mask)
+        # get the grid shape
+        grid_shape = self.helper_masker.get_grid_shape(bc_mask)
+
+        # Find interior boundary conditions
+        bc_interior = self._find_bclist_interior(bclist, grid_shape)
 
         # Prepare the first kernel inputs for all items in boundary condition list
         wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
@@ -363,6 +365,11 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 grid_shape
             ],
         )
+
+        # If there are no interior boundary conditions, skip the rest and retun early
+        if not bc_interior:
+            return bc_mask, missing_mask
+
         # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
         # Note 1: launching order of the following kernels are important here!
         # Note 2: Due to race conditioning, the two kernels cannot be fused together.
@@ -488,8 +495,11 @@ def interior_missing_mask_kernel(index: Any):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        # find interior boundary conditions
-        bc_interior, grid_shape = self._find_bclist_interior(bclist, bc_mask)
+        # get the grid shape
+        grid_shape = self.helper_masker.get_grid_shape(bc_mask)
+
+        # Find interior boundary conditions
+        bc_interior = self._find_bclist_interior(bclist, grid_shape)
 
         # Prepare the first kernel inputs for all items in boundary condition list
         wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
@@ -505,6 +515,10 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         )
         container_domain_bounds.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
+        # If there are no interior boundary conditions, skip the rest and retun early
+        if not bc_interior:
+            return bc_mask, missing_mask
+
         # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
         # Note 1: launching order of the following kernels are important here!
         # Note 2: Due to race conditioning, the two kernels cannot be fused together.
diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
new file mode 100644
index 00000000..0f9a75c3
--- /dev/null
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -0,0 +1,198 @@
+from typing import Any
+import copy
+
+import warp as wp
+
+from xlb.operator.operator import Operator
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.boundary_masker import IndicesBoundaryMasker
+import neon
+
+
+class MultiresIndicesBoundaryMasker(IndicesBoundaryMasker):
+    """
+    Operator for creating a boundary mask using indices of boundary conditions in a multi-resolution setting.
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet,
+        precision_policy: PrecisionPolicy,
+        compute_backend: ComputeBackend.WARP,
+    ):
+        # Call super
+        super().__init__(velocity_set, precision_policy, compute_backend)
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional_dict, _ = self._construct_warp()
+        functional_domain_bounds = functional_dict.get("functional_domain_bounds")
+        functional_interior_bc_mask = functional_dict.get("functional_interior_bc_mask")
+        functional_interior_missing_mask = functional_dict.get("functional_interior_missing_mask")
+
+        @neon.Container.factory(name="IndicesBoundaryMasker_DomainBounds")
+        def container_domain_bounds(
+            wp_bc_indices,
+            wp_id_numbers,
+            wp_is_interior,
+            bc_mask,
+            missing_mask,
+            grid_shape,
+            level,
+        ):
+            def domain_bounds_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask.get_grid(), level)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_write_handle(missing_mask)
+
+                @wp.func
+                def domain_bounds_kernel(index: Any):
+                    # apply the functional
+                    functional_domain_bounds(
+                        index,
+                        wp_bc_indices,
+                        wp_id_numbers,
+                        wp_is_interior,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        grid_shape,
+                    )
+
+                loader.declare_kernel(domain_bounds_kernel)
+
+            return domain_bounds_launcher
+
+        @neon.Container.factory(name="IndicesBoundaryMasker_InteriorBcMask")
+        def container_interior_bc_mask(
+            wp_bc_indices,
+            wp_id_numbers,
+            bc_mask,
+            level,
+        ):
+            def interior_bc_mask_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask.get_grid(), level)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+
+                @wp.func
+                def interior_bc_mask_kernel(index: Any):
+                    # apply the functional
+                    functional_interior_bc_mask(
+                        index,
+                        wp_bc_indices,
+                        wp_id_numbers,
+                        bc_mask_pn,
+                    )
+
+                loader.declare_kernel(interior_bc_mask_kernel)
+
+            return interior_bc_mask_launcher
+
+        @neon.Container.factory(name="IndicesBoundaryMasker_InteriorMissingMask")
+        def container_interior_missing_mask(
+            wp_bc_indices,
+            bc_mask,
+            missing_mask,
+            grid_shape,
+            level,
+        ):
+            def interior_bc_mask_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask.get_grid(), level)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_write_handle(missing_mask)
+
+                @wp.func
+                def interior_missing_mask_kernel(index: Any):
+                    # apply the functional
+                    functional_interior_missing_mask(
+                        index,
+                        wp_bc_indices,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        grid_shape,
+                    )
+
+                loader.declare_kernel(interior_missing_mask_kernel)
+
+            return interior_bc_mask_launcher
+
+        container_dict = {
+            "container_domain_bounds": container_domain_bounds,
+            "container_interior_bc_mask": container_interior_bc_mask,
+            "container_interior_missing_mask": container_interior_missing_mask,
+        }
+
+        return functional_dict, container_dict
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
+
+        grid = bc_mask.get_grid()
+        num_levels = grid.num_levels
+        grid_shape_finest = self.helper_masker.get_grid_shape(bc_mask)
+        for level in range(num_levels):
+
+            # Create a copy of the boundary condition list for the current level if the indices at that level are not empty
+            bclist_at_level = []
+            for bc in bclist:
+                if bc.indices is not None and bc.indices[level]:
+                    bc_copy = copy.copy(bc)  # shallow copy of the whole object
+                    bc_copy.indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
+                    bclist_at_level.append(bc_copy)
+
+            # If the boundary condition list is empty, skip to the next level
+            if not bclist_at_level:
+                continue
+
+            # find grid shape at current level
+            grid_shape_tuple = tuple([shape//2 ** level for shape in grid_shape_finest])
+            grid_shape_warp = wp.vec3i(*grid_shape_tuple)
+
+            # find interior boundary conditions
+            bc_interior = self._find_bclist_interior(bclist_at_level, grid_shape_tuple)
+
+            # Prepare the first kernel inputs for all items in boundary condition list
+            wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist_at_level, grid_shape_tuple)
+
+            # Launch the first container
+            container_domain_bounds = self.neon_container["container_domain_bounds"](
+                wp_bc_indices,
+                wp_id_numbers,
+                wp_is_interior,
+                bc_mask,
+                missing_mask,
+                grid_shape_warp,
+                level,
+            )
+            container_domain_bounds.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+            # If there are no interior boundary conditions, skip the rest of the processing for this level
+            if not bc_interior:
+                continue
+
+            # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
+            # Note 1: launching order of the following kernels are important here!
+            # Note 2: Due to race conditioning, the two kernels cannot be fused together.
+            wp_bc_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape_tuple)
+            container_interior_missing_mask = self.neon_container["container_interior_missing_mask"](
+                wp_bc_indices,
+                bc_mask,
+                missing_mask,
+                grid_shape_warp,
+                level,
+            )
+            container_interior_missing_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+            # Launch the third container
+            container_interior_bc_mask = self.neon_container["container_interior_bc_mask"](
+                wp_bc_indices,
+                wp_id_numbers,
+                bc_mask,
+                level,
+            )
+            container_interior_bc_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+        return bc_mask, missing_mask
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 67f84561..8950f46e 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -16,12 +16,11 @@
 from xlb.operator.boundary_condition.boundary_condition import ImplementationStep
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.collision import ForcedCollision
-from xlb.operator.boundary_masker import MultiresBoundaryMasker
 from xlb.helper import check_bc_overlaps
 from xlb.operator.boundary_masker import (
-    IndicesBoundaryMasker,
     MeshVoxelizationMethod,
     MultiresMeshMaskerAABB,
+    MultiresIndicesBoundaryMasker
 )
 
 
@@ -217,7 +216,7 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
         # TODO! check_bc_overlaps(boundary_conditions, DefaultConfig.velocity_set.d, DefaultConfig.default_backend)
 
         # Create boundary maskers
-        indices_masker = IndicesBoundaryMasker(
+        indices_masker = MultiresIndicesBoundaryMasker(
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
@@ -228,8 +227,8 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
         bc_with_indices = [bc for bc in boundary_conditions if bc.indices is not None]
 
         # Process indices-based boundary conditions
-        # if bc_with_indices:
-        #     bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
+        if bc_with_indices:
+            bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
 
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:

From ed98ee4121e84dac1620b961b5c2e331b0452a19 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 23 Jun 2025 13:46:21 -0400
Subject: [PATCH 089/208] WIP: Fixed a bug in multires indices masker and
 applied ruff formatting

---
 .../indices_boundary_masker.py                | 35 +++----------------
 .../multires_indices_boundary_masker.py       |  9 ++---
 xlb/operator/stepper/nse_multires_stepper.py  |  7 +---
 3 files changed, 11 insertions(+), 40 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 1807f210..9340ad96 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -203,9 +203,7 @@ def functional_interior_missing_mask(
 
                     # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
                     bc_mask_ngh = self.read_field_neighbor(bc_mask, index, offset, 0)
-                    if (self.helper_masker.is_in_bounds(pull_index, grid_shape, missing_mask)) and (
-                        bc_mask_ngh == wp.uint8(255)
-                    ):
+                    if (self.helper_masker.is_in_bounds(pull_index, grid_shape, missing_mask)) and (bc_mask_ngh == wp.uint8(255)):
                         self.write_field(missing_mask, index, l, wp.uint8(True))
 
         # Construct the warp 3D kernel
@@ -269,13 +267,7 @@ def kernel_interior_missing_mask(
             # Get local indices
             index = wp.vec3i(i, j, k)
 
-            functional_interior_missing_mask(
-                index,
-                bc_indices,
-                bc_mask,
-                missing_mask,
-                grid_shape
-            )
+            functional_interior_missing_mask(index, bc_indices, bc_mask, missing_mask, grid_shape)
 
         functional_dict = {
             "functional_domain_bounds": functional_domain_bounds,
@@ -356,14 +348,7 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         wp.launch(
             self.warp_kernel["kernel_domain_bounds"],
             dim=bc_mask.shape[1:],
-            inputs=[
-                wp_bc_indices,
-                wp_id_numbers,
-                wp_is_interior,
-                bc_mask,
-                missing_mask,
-                grid_shape
-            ],
+            inputs=[wp_bc_indices, wp_id_numbers, wp_is_interior, bc_mask, missing_mask, grid_shape],
         )
 
         # If there are no interior boundary conditions, skip the rest and retun early
@@ -377,12 +362,7 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         wp.launch(
             self.warp_kernel["kernel_interior_missing_mask"],
             dim=bc_mask.shape[1:],
-            inputs=[
-                wp_bc_indices,
-                bc_mask,
-                missing_mask,
-                grid_shape
-            ],
+            inputs=[wp_bc_indices, bc_mask, missing_mask, grid_shape],
         )
         wp.launch(
             self.warp_kernel["kernel_interior_bc_mask"],
@@ -523,12 +503,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         # Note 1: launching order of the following kernels are important here!
         # Note 2: Due to race conditioning, the two kernels cannot be fused together.
         wp_bc_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape)
-        container_interior_missing_mask = self.neon_container["container_interior_missing_mask"](
-            wp_bc_indices,
-            bc_mask,
-            missing_mask,
-            grid_shape
-        )
+        container_interior_missing_mask = self.neon_container["container_interior_missing_mask"](wp_bc_indices, bc_mask, missing_mask, grid_shape)
         container_interior_missing_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
         # Launch the third container
diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
index 0f9a75c3..3a86041f 100644
--- a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -1,5 +1,6 @@
 from typing import Any
 import copy
+import numpy as np
 
 import warp as wp
 
@@ -129,18 +130,18 @@ def interior_missing_mask_kernel(index: Any):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-
         grid = bc_mask.get_grid()
         num_levels = grid.num_levels
         grid_shape_finest = self.helper_masker.get_grid_shape(bc_mask)
         for level in range(num_levels):
-
             # Create a copy of the boundary condition list for the current level if the indices at that level are not empty
             bclist_at_level = []
             for bc in bclist:
                 if bc.indices is not None and bc.indices[level]:
                     bc_copy = copy.copy(bc)  # shallow copy of the whole object
-                    bc_copy.indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
+                    indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
+                    indices = np.array(indices) * 2**level
+                    bc_copy.indices = tuple(indices.tolist())  # convert to tuple
                     bclist_at_level.append(bc_copy)
 
             # If the boundary condition list is empty, skip to the next level
@@ -148,7 +149,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 continue
 
             # find grid shape at current level
-            grid_shape_tuple = tuple([shape//2 ** level for shape in grid_shape_finest])
+            grid_shape_tuple = tuple([shape // 2**level for shape in grid_shape_finest])
             grid_shape_warp = wp.vec3i(*grid_shape_tuple)
 
             # find interior boundary conditions
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 8950f46e..0bf4c905 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -17,11 +17,7 @@
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.collision import ForcedCollision
 from xlb.helper import check_bc_overlaps
-from xlb.operator.boundary_masker import (
-    MeshVoxelizationMethod,
-    MultiresMeshMaskerAABB,
-    MultiresIndicesBoundaryMasker
-)
+from xlb.operator.boundary_masker import MeshVoxelizationMethod, MultiresMeshMaskerAABB, MultiresIndicesBoundaryMasker
 
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
@@ -90,7 +86,6 @@ def prepare_fields(self, rho, u, initializer=None):
         f_0.update_host(0)
         wp.synchronize()
         bc_mask.export_vti("bc_mask.vti", "bc_mask")
-        exit(0)
         # f_0.export_vti("init_f0.vti", 'init_f0')
         # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 

From d87fe638805073dfd4f07aa98e0c964ba177dc73 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 23 Jun 2025 16:06:13 -0400
Subject: [PATCH 090/208] Fixed a bug related to get global indices

---
 .../multires_indices_boundary_masker.py         | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
index 3a86041f..3ee56671 100644
--- a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -140,7 +140,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 if bc.indices is not None and bc.indices[level]:
                     bc_copy = copy.copy(bc)  # shallow copy of the whole object
                     indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
-                    indices = np.array(indices) * 2**level
+                    indices = np.array(indices) * 2**level      # TODO: This is a hack
                     bc_copy.indices = tuple(indices.tolist())  # convert to tuple
                     bclist_at_level.append(bc_copy)
 
@@ -149,14 +149,15 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 continue
 
             # find grid shape at current level
-            grid_shape_tuple = tuple([shape // 2**level for shape in grid_shape_finest])
-            grid_shape_warp = wp.vec3i(*grid_shape_tuple)
+            # TODO: this is a hack. Should be corrected in the helper function when getting neon global indices
+            grid_shape_at_level = tuple([shape // 2**level for shape in grid_shape_finest])
+            grid_shape_finest_warp = wp.vec3i(*grid_shape_finest)
 
             # find interior boundary conditions
-            bc_interior = self._find_bclist_interior(bclist_at_level, grid_shape_tuple)
+            bc_interior = self._find_bclist_interior(bclist_at_level, grid_shape_at_level)
 
             # Prepare the first kernel inputs for all items in boundary condition list
-            wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist_at_level, grid_shape_tuple)
+            wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist_at_level, grid_shape_at_level)
 
             # Launch the first container
             container_domain_bounds = self.neon_container["container_domain_bounds"](
@@ -165,7 +166,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 wp_is_interior,
                 bc_mask,
                 missing_mask,
-                grid_shape_warp,
+                grid_shape_finest_warp,
                 level,
             )
             container_domain_bounds.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
@@ -177,12 +178,12 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
             # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
             # Note 1: launching order of the following kernels are important here!
             # Note 2: Due to race conditioning, the two kernels cannot be fused together.
-            wp_bc_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape_tuple)
+            wp_bc_indices, wp_id_numbers, _ = self._prepare_kernel_inputs(bc_interior, grid_shape_at_level)
             container_interior_missing_mask = self.neon_container["container_interior_missing_mask"](
                 wp_bc_indices,
                 bc_mask,
                 missing_mask,
-                grid_shape_warp,
+                grid_shape_finest_warp,
                 level,
             )
             container_interior_missing_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)

From 7d671d307f9264df56bef0b31cfbadf74157bb10 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 23 Jun 2025 16:35:10 -0400
Subject: [PATCH 091/208] Getting dtype from field using neon_get_type

---
 .../boundary_masker/multires_indices_boundary_masker.py     | 2 +-
 xlb/operator/operator.py                                    | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
index 3ee56671..6a5c53d5 100644
--- a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -140,7 +140,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
                 if bc.indices is not None and bc.indices[level]:
                     bc_copy = copy.copy(bc)  # shallow copy of the whole object
                     indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
-                    indices = np.array(indices) * 2**level      # TODO: This is a hack
+                    indices = np.array(indices) * 2**level  # TODO: This is a hack
                     bc_copy.indices = tuple(indices.tolist())  # convert to tuple
                     bclist_at_level.append(bc_copy)
 
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 37eb1705..52e9740a 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -220,6 +220,8 @@ def _construct_read_field_neighbor(self):
         """
         Construct a function to read a field value at a neighboring index along a given direction.
         """
+        from neon.multires.mPartition import neon_get_type
+
         if self.compute_backend == ComputeBackend.WARP:
 
             @wp.func
@@ -244,8 +246,8 @@ def read_field_neighbor(
             ):
                 # This function reads a field value at a given neighboring index and direction.
                 unused_is_valid = wp.bool(False)
-                # TODO: the type of the returned value should be the same as field's dtype
-                return wp.neon_read_ngh(field, index, offset, direction, wp.uint8(0), unused_is_valid)
+                dtype = neon_get_type(field)  # This is a placeholder to ensure the dtype is set correctly
+                return wp.neon_read_ngh(field, index, offset, direction, dtype(0.0), unused_is_valid)
 
         else:
             raise ValueError(f"Unsupported compute backend: {self.compute_backend}")

From 0a39df04fbaad7138e804b22e0faf64508937c48 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 24 Jun 2025 10:39:58 -0400
Subject: [PATCH 092/208] deleted the old multires boundary masker

---
 xlb/operator/boundary_masker/__init__.py      |   1 -
 .../multires_boundary_masker.py               | 155 ------------------
 2 files changed, 156 deletions(-)
 delete mode 100644 xlb/operator/boundary_masker/multires_boundary_masker.py

diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index 9041fc41..7cef263f 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -6,6 +6,5 @@
 from xlb.operator.boundary_masker.winding import MeshMaskerWinding
 from xlb.operator.boundary_masker.aabb_fill import MeshMaskerAABBFill
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
-from xlb.operator.boundary_masker.multires_boundary_masker import MultiresBoundaryMasker
 from xlb.operator.boundary_masker.multires_aabb import MultiresMeshMaskerAABB
 from xlb.operator.boundary_masker.multires_indices_boundary_masker import MultiresIndicesBoundaryMasker
diff --git a/xlb/operator/boundary_masker/multires_boundary_masker.py b/xlb/operator/boundary_masker/multires_boundary_masker.py
deleted file mode 100644
index 66bd5334..00000000
--- a/xlb/operator/boundary_masker/multires_boundary_masker.py
+++ /dev/null
@@ -1,155 +0,0 @@
-import warp as wp
-import neon, typing, copy
-from xlb.compute_backend import ComputeBackend
-from xlb.operator.operator import Operator
-from xlb.grid import grid_factory
-from xlb.precision_policy import Precision
-from xlb.operator.boundary_masker import (
-    IndicesBoundaryMasker,
-    MeshVoxelizationMethod,
-    MeshMaskerAABB,
-    MeshMaskerRay,
-    MeshMaskerWinding,
-    MeshMaskerAABBFill,
-)
-
-
-class MultiresBoundaryMasker(Operator):
-    """
-    Operator for creating a boundary mask for multi-resolution grids
-    """
-
-    def __init__(
-        self,
-        velocity_set=None,
-        precision_policy=None,
-        compute_backend=None,
-    ):
-        if compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
-            raise NotImplementedError(f"Operator {self.__class__.__name} not supported in {compute_backend} backend.")
-
-        # Call super
-        super().__init__(velocity_set, precision_policy, compute_backend)
-
-        # Create boundary maskers using the WARP backend
-        self.indices_masker = IndicesBoundaryMasker(
-            velocity_set=velocity_set,
-            precision_policy=precision_policy,
-            compute_backend=ComputeBackend.WARP,
-        )
-
-    @Operator.register_backend(ComputeBackend.NEON)
-    def neon_implementation(self, bclist, f_1, bc_mask, missing_mask, start_index=None, xlb_grid=None):
-        # Ensure that this operator is called on multires grids
-        assert bc_mask.get_grid().get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
-
-        # Make constants
-        _d = self.velocity_set.d
-
-        # number of levels
-        num_levels = bc_mask.get_grid().get_num_levels()
-        for level in range(num_levels):
-            # Use the warp backend to create dense fields to be written in multi-res NEON fields
-            refinement = 2**level
-            grid_shape = tuple(x // refinement for x in xlb_grid.shape)
-            grid_dense = grid_factory(grid_shape, compute_backend=ComputeBackend.WARP)
-            missing_mask_warp = grid_dense.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
-            bc_mask_warp = grid_dense.create_field(cardinality=1, dtype=Precision.UINT8)
-            f_1_warp = grid_dense.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-
-            # Set local constants
-            lattice_central_index = self.velocity_set.center_index
-
-            # create a new bclist for this level only
-            bc_with_indices = []
-            for bc in bclist:
-                if bc.indices is not None and bc.indices[level]:
-                    bc_copy = copy.copy(bc)  # shallow copy of the whole object
-                    bc_copy.indices = copy.deepcopy(bc.indices[level])  # deep copy only the modified part
-                    bc_with_indices.append(bc_copy)
-                elif bc.mesh_vertices is not None:
-                    bc_copy = copy.copy(bc)  # shallow copy of the whole object
-                    bc_copy.mesh_vertices = copy.deepcopy(bc.mesh_vertices) / refinement
-
-                    # call mesh masker for this bc at this level
-                    if bc.voxelization_method is MeshVoxelizationMethod.AABB:
-                        mesh_masker = MeshMaskerAABB(
-                            velocity_set=self.velocity_set,
-                            precision_policy=self.precision_policy,
-                            compute_backend=ComputeBackend.WARP,
-                        )
-                    elif bc.voxelization_method is MeshVoxelizationMethod.RAY:
-                        mesh_masker = MeshMaskerRay(
-                            velocity_set=self.velocity_set,
-                            precision_policy=self.precision_policy,
-                            compute_backend=ComputeBackend.WARP,
-                        )
-                    elif bc.voxelization_method is MeshVoxelizationMethod.WINDING:
-                        mesh_masker = MeshMaskerWinding(
-                            velocity_set=self.velocity_set,
-                            precision_policy=self.precision_policy,
-                            compute_backend=ComputeBackend.WARP,
-                        )
-                    elif bc.voxelization_method is MeshVoxelizationMethod.AABB_FILL:
-                        mesh_masker = MeshMaskerAABBFill(
-                            velocity_set=self.velocity_set,
-                            precision_policy=self.precision_policy,
-                            compute_backend=ComputeBackend.WARP,
-                        )
-                    else:
-                        raise ValueError(f"Unsupported voxelization method: {bc.voxelization_method}")
-                    f_1_warp, bc_mask_warp, missing_mask_warp = mesh_masker(bc_copy, f_1_warp, bc_mask_warp, missing_mask_warp)
-
-            # call indices masker for all BC's with indices at this level
-            bc_mask_warp, missing_mask_warp = self.indices_masker(bc_with_indices, bc_mask_warp, missing_mask_warp, start_index)
-
-            @neon.Container.factory(name="MultiresBoundaryMasker")
-            def container(
-                f_1_warp: typing.Any,
-                bc_mask_warp: typing.Any,
-                missing_mask_warp: typing.Any,
-                f_1_field: typing.Any,
-                bc_mask_field: typing.Any,
-                missing_mask_field: typing.Any,
-            ):
-                def loading_step(loader: neon.Loader):
-                    loader.set_mres_grid(bc_mask_field.get_grid(), level)
-                    f_1_hdl = loader.get_mres_write_handle(f_1_field)
-                    bc_mask_hdl = loader.get_mres_write_handle(bc_mask_field)
-                    missing_mask_hdl = loader.get_mres_write_handle(missing_mask_field)
-
-                    @wp.func
-                    def masker(gridIdx: typing.Any):
-                        cIdx = wp.neon_global_idx(bc_mask_hdl, gridIdx)
-                        # get local indices by dividing the global indices (associated with the finest level) by 2^level
-                        lx = wp.neon_get_x(cIdx) // refinement
-                        ly = wp.neon_get_y(cIdx) // refinement
-                        lz = wp.neon_get_z(cIdx) // refinement
-
-                        # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
-                        if _d == 2:
-                            ly, lz = lz, ly
-
-                        local_mask = bc_mask_warp[0, lx, ly, lz]
-                        wp.neon_write(bc_mask_hdl, gridIdx, 0, local_mask)
-
-                        for q in range(self.velocity_set.q):
-                            is_missing = wp.uint8(missing_mask_warp[q, lx, ly, lz])
-                            wp.neon_write(missing_mask_hdl, gridIdx, q, is_missing)
-
-                            if q != lattice_central_index and is_missing == wp.uint8(False):
-                                wp.neon_write(f_1_hdl, gridIdx, q, f_1_warp[q, lx, ly, lz])
-
-                    loader.declare_kernel(masker)
-
-                return loading_step
-
-            c = container(f_1_warp, bc_mask_warp, missing_mask_warp, f_1, bc_mask, missing_mask)
-            c.run(0)
-            wp.synchronize()
-
-            del f_1_warp
-            del bc_mask_warp
-            del missing_mask_warp
-
-        return f_1, bc_mask, missing_mask

From ce28be10fe48bf2760c876033ca1ffb1cdfdbecf Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 24 Jun 2025 12:01:40 -0400
Subject: [PATCH 093/208] Ensuring other voxelization methods work for dense
 and raising error if chosen for multi-res

---
 xlb/operator/boundary_masker/aabb_fill.py     | 24 +++++--------------
 .../helper_functions_masker.py                | 14 +++++------
 .../indices_boundary_masker.py                |  4 ++--
 .../boundary_masker/mesh_boundary_masker.py   |  2 +-
 xlb/operator/boundary_masker/ray.py           |  2 +-
 xlb/operator/boundary_masker/winding.py       |  4 ++--
 xlb/operator/stepper/nse_multires_stepper.py  | 20 +---------------
 7 files changed, 20 insertions(+), 50 deletions(-)

diff --git a/xlb/operator/boundary_masker/aabb_fill.py b/xlb/operator/boundary_masker/aabb_fill.py
index 85c338ce..a0f932a6 100644
--- a/xlb/operator/boundary_masker/aabb_fill.py
+++ b/xlb/operator/boundary_masker/aabb_fill.py
@@ -40,14 +40,8 @@ def _construct_warp(self):
         @wp.kernel
         def erode_tile(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any)):
             i, j, k = wp.tid()
-            if (
-                i < TILE_HALF
-                or i >= f_field.shape[0] - TILE_HALF
-                or j < TILE_HALF
-                or j >= f_field.shape[1] - TILE_HALF
-                or k < TILE_HALF
-                or k >= f_field.shape[2] - TILE_HALF
-            ):
+            index = wp.vec3i(i, j, k)
+            if not self.helper_masker.is_in_bounds(index, wp.vec3i(f_field.shape[0], f_field.shape[1], f_field.shape[2]), TILE_HALF):
                 f_field_out[i, j, k] = f_field[i, j, k]
                 return
             t = wp.tile_load(f_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
@@ -58,14 +52,8 @@ def erode_tile(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any
         @wp.kernel
         def dilate_tile(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any)):
             i, j, k = wp.tid()
-            if (
-                i < TILE_HALF
-                or i >= f_field.shape[0] - TILE_HALF
-                or j < TILE_HALF
-                or j >= f_field.shape[1] - TILE_HALF
-                or k < TILE_HALF
-                or k >= f_field.shape[2] - TILE_HALF
-            ):
+            index = wp.vec3i(i, j, k)
+            if not self.helper_masker.is_in_bounds(index, wp.vec3i(f_field.shape[0], f_field.shape[1], f_field.shape[2]), TILE_HALF):
                 f_field_out[i, j, k] = f_field[i, j, k]
                 return
             t = wp.tile_load(f_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
@@ -87,7 +75,7 @@ def kernel_solid(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            cell_center_pos = self.index_to_position(index) + offset
+            cell_center_pos = self.helper_masker.index_to_position(solid_mask, index) + offset
             half = wp.vec3(0.5, 0.5, 0.5)
 
             if self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - half):
@@ -112,7 +100,7 @@ def kernel(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            cell_center_pos = self.index_to_position(index)
+            cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
 
             if solid_mask[i, j, k] == wp.uint8(255) or bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255):
                 # Make solid voxel
diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index d7feef9b..f34a8ad5 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -51,14 +51,14 @@ def index_to_position_neon(field: Any, index: Any):
             return index_to_position_warp(field, index_wp)
 
         @wp.func
-        def is_in_bounds(index: wp.vec3i, grid_shape: wp.vec3i, field: Any):
+        def is_in_bounds(index: wp.vec3i, grid_shape: wp.vec3i, SHIFT: Any = 0):
             return (
-                index[0] >= 0
-                and index[0] < grid_shape[0]
-                and index[1] >= 0
-                and index[1] < grid_shape[1]
-                and index[2] >= 0
-                and index[2] < grid_shape[2]
+                index[0] >= SHIFT
+                and index[0] < grid_shape[0] - SHIFT
+                and index[1] >= SHIFT
+                and index[1] < grid_shape[1] - SHIFT
+                and index[2] >= SHIFT
+                and index[2] < grid_shape[2] - SHIFT
             )
 
         @wp.func
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 9340ad96..f15ba26b 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -167,7 +167,7 @@ def functional_domain_bounds(
 
                     # Check if pull index is out of bound
                     # These directions will have missing information after streaming
-                    if not self.helper_masker.is_in_bounds(pull_index, grid_shape, missing_mask):
+                    if not self.helper_masker.is_in_bounds(pull_index, grid_shape):
                         # Set the missing mask
                         self.write_field(missing_mask, index, l, wp.uint8(True))
 
@@ -203,7 +203,7 @@ def functional_interior_missing_mask(
 
                     # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
                     bc_mask_ngh = self.read_field_neighbor(bc_mask, index, offset, 0)
-                    if (self.helper_masker.is_in_bounds(pull_index, grid_shape, missing_mask)) and (bc_mask_ngh == wp.uint8(255)):
+                    if (self.helper_masker.is_in_bounds(pull_index, grid_shape)) and (bc_mask_ngh == wp.uint8(255)):
                         self.write_field(missing_mask, index, l, wp.uint8(True))
 
         # Construct the warp 3D kernel
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index 1447cbd5..e7f3f488 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -59,7 +59,7 @@ def out_of_bound_pull_index(
 
             # check if pull index is out of bound
             # These directions will have missing information after streaming
-            missing = not self.helper_masker.is_in_bounds(pull_index, grid_shape, field)
+            missing = not self.helper_masker.is_in_bounds(pull_index, grid_shape)
             return missing
 
         # Function to precompute useful values per triangle, assuming spacing is (1,1,1)
diff --git a/xlb/operator/boundary_masker/ray.py b/xlb/operator/boundary_masker/ray.py
index 4cfefea9..a1d83580 100644
--- a/xlb/operator/boundary_masker/ray.py
+++ b/xlb/operator/boundary_masker/ray.py
@@ -43,7 +43,7 @@ def kernel(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            cell_center_pos = self.index_to_position(index)
+            cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
 
             # Find the fractional distance to the mesh in each direction
             for direction_idx in range(1, _q):
diff --git a/xlb/operator/boundary_masker/winding.py b/xlb/operator/boundary_masker/winding.py
index f38e9cdf..328e528d 100644
--- a/xlb/operator/boundary_masker/winding.py
+++ b/xlb/operator/boundary_masker/winding.py
@@ -43,7 +43,7 @@ def kernel(
             index = wp.vec3i(i, j, k)
 
             # position of the point
-            pos_cell = self.index_to_position(index)
+            pos_cell = self.helper_masker.index_to_position(bc_mask, index)
 
             # Compute the maximum length
             max_length = wp.sqrt(
@@ -80,7 +80,7 @@ def kernel(
 
                             # get position of the mesh triangle that intersects with the ray
                             pos_mesh = wp.mesh_eval_position(mesh_id, query_dir.face, query_dir.u, query_dir.v)
-                            cell_center_pos = self.index_to_position(push_index)
+                            cell_center_pos = self.helper_masker.index_to_position(bc_mask, push_index)
                             dist = wp.length(pos_mesh - cell_center_pos)
                             weight = self.store_dtype(dist / max_length)
                             distances[_opp_indices[direction_idx], push_index[0], push_index[1], push_index[2]] = weight
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 0bf4c905..e7896b50 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -234,26 +234,8 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
-                # elif bc.voxelization_method is MeshVoxelizationMethod.RAY:
-                #     mesh_masker = MeshMaskerRay(
-                #         velocity_set=DefaultConfig.velocity_set,
-                #         precision_policy=DefaultConfig.default_precision_policy,
-                #         compute_backend=DefaultConfig.default_backend,
-                #     )
-                # elif bc.voxelization_method is MeshVoxelizationMethod.WINDING:
-                #     mesh_masker = MeshMaskerWinding(
-                #         velocity_set=DefaultConfig.velocity_set,
-                #         precision_policy=DefaultConfig.default_precision_policy,
-                #         compute_backend=DefaultConfig.default_backend,
-                #     )
-                # elif bc.voxelization_method is MeshVoxelizationMethod.AABB_FILL:
-                #     mesh_masker = MeshMaskerAABBFill(
-                #         velocity_set=DefaultConfig.velocity_set,
-                #         precision_policy=DefaultConfig.default_precision_policy,
-                #         compute_backend=DefaultConfig.default_backend,
-                #     )
                 else:
-                    raise ValueError(f"Unsupported voxelization method: {bc.voxelization_method}")
+                    raise ValueError(f"Unsupported voxelization method for multi-res: {bc.voxelization_method}")
                 # Apply the mesh masker to the boundary condition
                 f_1, bc_mask, missing_mask = mesh_masker(bc, f_1, bc_mask, missing_mask)
 

From b5cb130386ee4e409988af00d7e1783200945621 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 25 Jun 2025 09:37:11 -0400
Subject: [PATCH 094/208] moved the computations of the momentum exchange to a
 functional and used read_field

---
 xlb/operator/force/momentum_transfer.py | 55 ++++++++++++++++---------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 601c0fe9..dc0df83e 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -101,28 +101,20 @@ def _construct_warp(self):
         # Find velocity index for 0, 0, 0
         lattice_central_index = self.velocity_set.center_index
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel(
-            f_0: wp.array4d(dtype=Any),
-            f_1: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.uint8),
-            force: wp.array(dtype=Any),
+        @wp.func
+        def functional(
+            index: Any,
+            f_0: Any,
+            f_1: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            force: Any,
         ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
             # Get the boundary id
-            _boundary_id = bc_mask[0, index[0], index[1], index[2]]
+            _boundary_id = self.read_field(bc_mask, index, 0)
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
-                # TODO fix vec bool
-                if missing_mask[l, index[0], index[1], index[2]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
+                _missing_mask[l] = self.read_field(missing_mask, index, l)
 
             # Determin if boundary is an edge by checking if center is missing
             is_edge = wp.bool(False)
@@ -136,7 +128,7 @@ def kernel(
                 # Get the distribution function
                 f_post_collision = _f_vec()
                 for l in range(self.velocity_set.q):
-                    f_post_collision[l] = self.compute_dtype(f_0[l, index[0], index[1], index[2]])
+                    f_post_collision[l] = self.compute_dtype(self.read_field(f_0, index, l))
 
                 # Apply streaming (pull method)
                 timestep = 0
@@ -153,9 +145,32 @@ def kernel(
                                 m[d] += phi
                             elif _c[d, _opp_indices[l]] == -1:
                                 m[d] -= phi
-
+            # Atomic sum to get the total force vector
             wp.atomic_add(force, 0, m)
 
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(
+            f_0: wp.array4d(dtype=Any),
+            f_1: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.uint8),
+            force: wp.array(dtype=Any),
+        ):
+            # Get the global index
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            # Call the functional to compute the force
+            functional(
+                index,
+                f_0,
+                f_1,
+                bc_mask,
+                missing_mask,
+                force,
+            )
+
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)

From d637ec15640e010f214dc853cc5fb37d50ea082a Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 25 Jun 2025 09:52:21 -0400
Subject: [PATCH 095/208] moved the definition of streaming functional to 
 construct warp

---
 xlb/operator/force/momentum_transfer.py |  5 ++++-
 xlb/operator/stream/stream.py           | 27 +++----------------------
 2 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index dc0df83e..9784ef18 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -132,7 +132,7 @@ def functional(
 
                 # Apply streaming (pull method)
                 timestep = 0
-                f_post_stream = self.stream.warp_functional(f_0, index)
+                f_post_stream = self.stream_functional(f_0, index)
                 f_post_stream = self.no_slip_bc_instance.warp_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
 
                 # Compute the momentum transfer
@@ -179,6 +179,9 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
         force = wp.zeros((1), dtype=_u_vec)
 
+        # Define the warp functional for streaming operation
+        self.stream_functional = self.stream.warp_functional
+
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index 8626d101..3773ea4e 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -127,35 +127,14 @@ def functional(
             # Pull the distribution function
             _f = _f_vec()
             for l in range(self.velocity_set.q):
-                # Get pull index
-                # pull_index = type(index)()
-                # for d in range(self.velocity_set.d):
-                #     pull_index[d] = index[d] - _c[d, l]
-
+                # Get pull offset
                 ngh = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-
                 unused_is_valid = wp.bool(False)
-                _f[l] = wp.neon_read_ngh(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)
 
+                # Read the distribution function from the neighboring cell in the pull direction
+                _f[l] = wp.neon_read_ngh(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)
             return _f
 
-        # # Construct the warp kernel
-        # @wp.kernel
-        # def kernel(
-        #     f_0: wp.array4d(dtype=Any),
-        #     f_1: wp.array4d(dtype=Any),
-        # ):
-        #     # Get the global index
-        #     i, j, k = wp.tid()
-        #     index = wp.vec3i(i, j, k)
-        #
-        #     # Set the output
-        #     _f = functional(f_0, index)
-        #
-        #     # Write the output
-        #     for l in range(self.velocity_set.q):
-        #         f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
         return functional, None
 
     @Operator.register_backend(ComputeBackend.NEON)

From 3f8acb06fbaedb4d22a71b0174dfb70933502811 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 25 Jun 2025 10:30:27 -0400
Subject: [PATCH 096/208] Added neon implementations of MomentumTransfer for
 dense and multires

---
 xlb/operator/force/__init__.py                |  1 +
 xlb/operator/force/momentum_transfer.py       | 59 ++++++++++++
 .../force/multires_momentum_transfer.py       | 90 +++++++++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 xlb/operator/force/multires_momentum_transfer.py

diff --git a/xlb/operator/force/__init__.py b/xlb/operator/force/__init__.py
index ba8a13c3..f3ceec57 100644
--- a/xlb/operator/force/__init__.py
+++ b/xlb/operator/force/__init__.py
@@ -1,2 +1,3 @@
 from xlb.operator.force.momentum_transfer import MomentumTransfer
 from xlb.operator.force.exact_difference_force import ExactDifference
+from xlb.operator.force.multires_momentum_transfer import MultiresMomentumTransfer
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 9784ef18..4258ffaf 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -9,6 +9,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
 from xlb.operator.stream import Stream
+import neon
 
 
 class MomentumTransfer(Operator):
@@ -189,3 +190,61 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
             dim=f_0.shape[1:],
         )
         return force.numpy()[0]
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="MomentumTransfer")
+        def container(
+            f_0: Any,
+            f_1: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            force: Any,
+        ):
+            def container_launcher(loader: neon.Loader):
+                loader.set_grid(bc_mask.get_grid())
+                bc_mask_pn = loader.get_write_handle(bc_mask)
+                missing_mask_pn = loader.get_write_handle(missing_mask)
+                f_0_pn = loader.get_write_handle(f_0)
+                f_1_pn = loader.get_write_handle(f_1)
+
+                @wp.func
+                def container_kernel(index: Any):
+                    # apply the functional
+                    functional(
+                        index,
+                        f_0_pn,
+                        f_1_pn,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        force,
+                    )
+
+                loader.declare_kernel(container_kernel)
+
+            return container_launcher
+
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(
+        self,
+        f_0,
+        f_1,
+        bc_mask,
+        missing_mask,
+        stream=0,
+    ):
+        # Allocate the force vector (the total integral value will be computed)
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        force = wp.zeros((1), dtype=_u_vec)
+
+        # Define the warp functional for streaming operation
+        self.stream_functional = self.stream.neon_functional
+
+        # Launch the neon container
+        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, force)
+        c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
+        return force.numpy()[0]
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
new file mode 100644
index 00000000..1aebb161
--- /dev/null
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -0,0 +1,90 @@
+from typing import Any
+
+import warp as wp
+import neon
+
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.operator import Operator
+from xlb.operator.force import MomentumTransfer
+
+
+class MultiresMomentumTransfer(MomentumTransfer):
+    """
+    Multiresolution Momentum Transfer operator for computing the force on a multiresolution grid.
+    This operator uses the same approach as its parent class for computing the forces.
+    """
+
+    def __init__(
+        self,
+        no_slip_bc_instance,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        # Call super
+        super().__init__(no_slip_bc_instance, velocity_set, precision_policy, compute_backend)
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="MomentumTransfer")
+        def container(
+            f_0: Any,
+            f_1: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            force: Any,
+            level: Any,
+        ):
+            def container_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask.get_grid(), level)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_write_handle(missing_mask)
+                f_0_pn = loader.get_mres_write_handle(f_0)
+                f_1_pn = loader.get_mres_write_handle(f_1)
+
+                @wp.func
+                def container_kernel(index: Any):
+                    # apply the functional
+                    functional(
+                        index,
+                        f_0_pn,
+                        f_1_pn,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        force,
+                    )
+
+                loader.declare_kernel(container_kernel)
+
+            return container_launcher
+
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(
+        self,
+        f_0,
+        f_1,
+        bc_mask,
+        missing_mask,
+        stream=0,
+    ):
+        # Allocate the force vector (the total integral value will be computed)
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        force = wp.zeros((1), dtype=_u_vec)
+
+        # Define the warp functional for streaming operation
+        self.stream_functional = self.stream.neon_functional
+
+        grid = bc_mask.get_grid()
+        for level in range(grid.num_levels):
+            # Launch the neon container
+            c = self.neon_container(f_0, f_1, bc_mask, missing_mask, force, level)
+            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
+        return force.numpy()[0]

From 9bedb756a8389e3a2088c3ce66d1f46e50d208f0 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 25 Jun 2025 11:07:58 -0400
Subject: [PATCH 097/208] Moved no-slip bc functional also outside to the warp
 and neon construct

---
 .../stl_flow_past_sphere_3d.py                | 32 +++++++++++++++----
 xlb/operator/force/momentum_transfer.py       | 10 +++---
 .../force/multires_momentum_transfer.py       |  3 +-
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index 33398a6c..afe9ef85 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -1,3 +1,8 @@
+import neon
+import warp as wp
+import numpy as np
+import time
+
 import xlb
 from xlb.compute_backend import ComputeBackend
 from xlb.precision_policy import PrecisionPolicy
@@ -13,11 +18,8 @@
     HybridBC,
 )
 from xlb.utils import make_cuboid_mesh
-import neon
-import warp as wp
-import numpy as np
-import time
 from xlb.operator.boundary_masker import MeshVoxelizationMethod
+from xlb.operator.force import MultiresMomentumTransfer
 
 
 def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape):
@@ -85,7 +87,7 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape
         level_data.append(level)
         level_origins.append(box_origin)
 
-    return level_data, level_origins, mesh_vertices
+    return level_data, level_origins, mesh_vertices, sphere_radius
 
 
 # -------------------------- Simulation Setup --------------------------
@@ -112,7 +114,7 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape
 
 # Generate the cuboid mesh and sphere vertices
 stl_filename = "examples/cfd/stl-files/sphere.stl"
-level_data, level_origins, sphere = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape)
+level_data, level_origins, sphere, sphere_radius = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape)
 
 # get the number of levels
 num_levels = len(level_data)
@@ -192,6 +194,23 @@ def bc_profile_warp(index: wp.vec3i):
     collision_type="KBC",
 )
 
+# Setup Momentum Transfer for Force Calculation
+bc_sphre = boundary_conditions[-1]
+momentum_transfer = MultiresMomentumTransfer(bc_sphere, compute_backend=compute_backend)
+
+
+def print_lift_drag(sim):
+    # Compute lift and drag
+    boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
+    drag = boundary_force[0]  # x-direction
+    lift = boundary_force[2]
+    sphere_cross_section = np.pi * sphere_radius**2
+    u_avg = 0.5 * u_max
+    cd = 2.0 * drag / (u_avg**2 * sphere_cross_section)
+    cl = 2.0 * lift / (u_avg**2 * sphere_cross_section)
+    print(f"CD={cd}, CL={cl}")
+
+
 # -------------------------- Simulation Loop --------------------------
 
 wp.synchronize()
@@ -202,6 +221,7 @@ def bc_profile_warp(index: wp.vec3i):
     if step % post_process_interval == 0 or step == num_steps - 1:
         # TODO: Issues in the vtk output for rectangular cuboids (as if a duboid grid with the largest side is assumed)
         sim.export_macroscopic("multires_flow_over_sphere_3d_")
+        print_lift_drag(sim)
         wp.synchronize()
         end_time = time.time()
         elapsed = end_time - start_time
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 4258ffaf..13027c6d 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -134,7 +134,7 @@ def functional(
                 # Apply streaming (pull method)
                 timestep = 0
                 f_post_stream = self.stream_functional(f_0, index)
-                f_post_stream = self.no_slip_bc_instance.warp_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
+                f_post_stream = self.no_slip_bc_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
 
                 # Compute the momentum transfer
                 for d in range(self.velocity_set.d):
@@ -172,7 +172,7 @@ def kernel(
                 force,
             )
 
-        return None, kernel
+        return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
@@ -180,8 +180,9 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
         force = wp.zeros((1), dtype=_u_vec)
 
-        # Define the warp functional for streaming operation
+        # Define the warp functionals needed for this operation
         self.stream_functional = self.stream.warp_functional
+        self.no_slip_bc_functional = self.no_slip_bc_instance.warp_functional
 
         # Launch the warp kernel
         wp.launch(
@@ -241,8 +242,9 @@ def neon_implementation(
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
         force = wp.zeros((1), dtype=_u_vec)
 
-        # Define the warp functional for streaming operation
+        # Define the neon functionals needed for this operation
         self.stream_functional = self.stream.neon_functional
+        self.no_slip_bc_functional = self.no_slip_bc_instance.neon_functional
 
         # Launch the neon container
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, force)
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 1aebb161..7850a20e 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -79,8 +79,9 @@ def neon_implementation(
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
         force = wp.zeros((1), dtype=_u_vec)
 
-        # Define the warp functional for streaming operation
+        # Define the neon functionals needed for this operation
         self.stream_functional = self.stream.neon_functional
+        self.no_slip_bc_functional = self.no_slip_bc_instance.neon_functional
 
         grid = bc_mask.get_grid()
         for level in range(grid.num_levels):

From 7d1a7c1b587a634a1dd48900f07572ca8e18a236 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 25 Jun 2025 14:33:34 -0400
Subject: [PATCH 098/208] fixed some operators default values

---
 xlb/operator/boundary_masker/aabb.py                        | 6 +++---
 xlb/operator/boundary_masker/aabb_fill.py                   | 6 +++---
 xlb/operator/boundary_masker/mesh_boundary_masker.py        | 6 +++---
 xlb/operator/boundary_masker/multires_aabb.py               | 6 +++---
 .../boundary_masker/multires_indices_boundary_masker.py     | 6 +++---
 xlb/operator/boundary_masker/ray.py                         | 6 +++---
 xlb/operator/boundary_masker/winding.py                     | 6 +++---
 7 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index 375fbd76..f0f562b3 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -19,9 +19,9 @@ class MeshMaskerAABB(MeshBoundaryMasker):
 
     def __init__(
         self,
-        velocity_set: VelocitySet,
-        precision_policy: PrecisionPolicy,
-        compute_backend: ComputeBackend.WARP,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
diff --git a/xlb/operator/boundary_masker/aabb_fill.py b/xlb/operator/boundary_masker/aabb_fill.py
index a0f932a6..6386546d 100644
--- a/xlb/operator/boundary_masker/aabb_fill.py
+++ b/xlb/operator/boundary_masker/aabb_fill.py
@@ -18,9 +18,9 @@ class MeshMaskerAABBFill(MeshBoundaryMasker):
 
     def __init__(
         self,
-        velocity_set: VelocitySet,
-        precision_policy: PrecisionPolicy,
-        compute_backend: ComputeBackend.WARP,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
         fill_in_voxels: int = 3,
     ):
         # Call super
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index e7f3f488..aa983eb2 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -17,9 +17,9 @@ class MeshBoundaryMasker(Operator):
 
     def __init__(
         self,
-        velocity_set: VelocitySet,
-        precision_policy: PrecisionPolicy,
-        compute_backend: ComputeBackend.WARP,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
diff --git a/xlb/operator/boundary_masker/multires_aabb.py b/xlb/operator/boundary_masker/multires_aabb.py
index b5367b9f..d7633c07 100644
--- a/xlb/operator/boundary_masker/multires_aabb.py
+++ b/xlb/operator/boundary_masker/multires_aabb.py
@@ -19,9 +19,9 @@ class MultiresMeshMaskerAABB(MeshMaskerAABB):
 
     def __init__(
         self,
-        velocity_set: VelocitySet,
-        precision_policy: PrecisionPolicy,
-        compute_backend: ComputeBackend.WARP,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
index 6a5c53d5..c9e1e1a0 100644
--- a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -19,9 +19,9 @@ class MultiresIndicesBoundaryMasker(IndicesBoundaryMasker):
 
     def __init__(
         self,
-        velocity_set: VelocitySet,
-        precision_policy: PrecisionPolicy,
-        compute_backend: ComputeBackend.WARP,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
diff --git a/xlb/operator/boundary_masker/ray.py b/xlb/operator/boundary_masker/ray.py
index a1d83580..b1e895f0 100644
--- a/xlb/operator/boundary_masker/ray.py
+++ b/xlb/operator/boundary_masker/ray.py
@@ -14,9 +14,9 @@ class MeshMaskerRay(MeshBoundaryMasker):
 
     def __init__(
         self,
-        velocity_set: VelocitySet,
-        precision_policy: PrecisionPolicy,
-        compute_backend: ComputeBackend.WARP,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
diff --git a/xlb/operator/boundary_masker/winding.py b/xlb/operator/boundary_masker/winding.py
index 328e528d..9e1ae722 100644
--- a/xlb/operator/boundary_masker/winding.py
+++ b/xlb/operator/boundary_masker/winding.py
@@ -14,9 +14,9 @@ class MeshMaskerWinding(MeshBoundaryMasker):
 
     def __init__(
         self,
-        velocity_set: VelocitySet,
-        precision_policy: PrecisionPolicy,
-        compute_backend: ComputeBackend.WARP,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)

From 930253f3447674b736ba7866e9366437e588593d Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 26 Jun 2025 11:38:19 -0400
Subject: [PATCH 099/208] moved force to be a member of the class with a single
 pointer. Otherwise the pointers would be recreated at every launch and that
 was not captured properly by the neon implementation

---
 xlb/operator/force/momentum_transfer.py       | 22 ++++++++++---------
 .../force/multires_momentum_transfer.py       |  9 ++++----
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 13027c6d..3b0effc6 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -49,6 +49,10 @@ def __init__(
             compute_backend,
         )
 
+        # Allocate the force vector (the total integral value will be computed)
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        self.force = wp.zeros((1), dtype=_u_vec)
+
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
     def jax_implementation(self, f_0, f_1, bc_mask, missing_mask):
@@ -176,9 +180,8 @@ def kernel(
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
-        # Allocate the force vector (the total integral value will be computed)
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        force = wp.zeros((1), dtype=_u_vec)
+        # Ensure the force is initialized to zero
+        self.force *= 0.0
 
         # Define the warp functionals needed for this operation
         self.stream_functional = self.stream.warp_functional
@@ -187,10 +190,10 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
-            inputs=[f_0, f_1, bc_mask, missing_mask, force],
+            inputs=[f_0, f_1, bc_mask, missing_mask, self.force],
             dim=f_0.shape[1:],
         )
-        return force.numpy()[0]
+        return self.force.numpy()[0]
 
     def _construct_neon(self):
         # Use the warp functional for the NEON backend
@@ -238,15 +241,14 @@ def neon_implementation(
         missing_mask,
         stream=0,
     ):
-        # Allocate the force vector (the total integral value will be computed)
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        force = wp.zeros((1), dtype=_u_vec)
+        # Ensure the force is initialized to zero
+        self.force *= 0.0
 
         # Define the neon functionals needed for this operation
         self.stream_functional = self.stream.neon_functional
         self.no_slip_bc_functional = self.no_slip_bc_instance.neon_functional
 
         # Launch the neon container
-        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, force)
+        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, self.force)
         c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
-        return force.numpy()[0]
+        return self.force.numpy()[0]
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 7850a20e..e9b12b0a 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -75,9 +75,8 @@ def neon_implementation(
         missing_mask,
         stream=0,
     ):
-        # Allocate the force vector (the total integral value will be computed)
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        force = wp.zeros((1), dtype=_u_vec)
+        # Ensure the force is initialized to zero
+        self.force *= 0.0
 
         # Define the neon functionals needed for this operation
         self.stream_functional = self.stream.neon_functional
@@ -86,6 +85,6 @@ def neon_implementation(
         grid = bc_mask.get_grid()
         for level in range(grid.num_levels):
             # Launch the neon container
-            c = self.neon_container(f_0, f_1, bc_mask, missing_mask, force, level)
+            c = self.neon_container(f_0, f_1, bc_mask, missing_mask, self.force, level)
             c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
-        return force.numpy()[0]
+        return self.force.numpy()[0]

From e91247cd2ed158ae6c584f18e5e8120ecaf0c1a2 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 26 Jun 2025 17:48:01 -0400
Subject: [PATCH 100/208] minor todo items done

---
 xlb/operator/boundary_condition/boundary_condition.py | 4 ++--
 xlb/operator/force/momentum_transfer.py               | 6 +++---
 xlb/operator/stepper/nse_multires_stepper.py          | 6 ++----
 xlb/operator/stepper/nse_stepper.py                   | 6 ++----
 4 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index be780341..d99692c2 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -212,7 +212,7 @@ def _construct_aux_data_init_container(self, functional):
         _opp_indices = self.velocity_set.opp_indices
         _num_of_aux_data = self.num_of_aux_data
 
-        # Find velocity index for 0, 0, 0
+        # Find velocity index for (0, 0, 0)
         lattice_central_index = self.velocity_set.center_index
 
         # Construct the Neon container
@@ -297,7 +297,7 @@ def _construct_multires_aux_data_init_container(self, functional):
         _opp_indices = self.velocity_set.opp_indices
         _num_of_aux_data = self.num_of_aux_data
 
-        # Find velocity index for 0, 0, 0
+        # Find velocity index for (0, 0, 0)
         lattice_central_index = self.velocity_set.center_index
 
         # Construct the Neon container
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 3b0effc6..97ba8df3 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -95,15 +95,15 @@ def jax_implementation(self, f_0, f_1, bc_mask, missing_mask):
         return force_net
 
     def _construct_warp(self):
-        # Set local constants TODO: This is a hack and should be fixed with warp update
+        # Set local constants
         _c = self.velocity_set.c
         _opp_indices = self.velocity_set.opp_indices
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)  # TODO fix vec bool
+        _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _no_slip_id = self.no_slip_bc_instance.id
 
-        # Find velocity index for 0, 0, 0
+        # Find velocity index for (0, 0, 0)
         lattice_central_index = self.velocity_set.center_index
 
         @wp.func
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index e7896b50..763081ce 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -336,14 +336,12 @@ def neon_apply_aux_recovery_bc(
                             # Perform the swapping of data
                             if l == lattice_central_index:
                                 # (i) Recover the values stored in the central index of f_1
-                                # TODO: Add store dtype
                                 _f1_thread = wp.neon_read(f_1_pn, index, l)
-                                wp.neon_write(f_0_pn, index, l, _f1_thread)
+                                wp.neon_write(f_0_pn, index, l, self.store_dtype(_f1_thread))
                             elif _missing_mask[l] == wp.uint8(1):
                                 # (ii) Recover the values stored in the missing directions of f_1
-                                # TODO: Add store dtype
                                 _f1_thread = wp.neon_read(f_1_pn, index, _opp_indices[l])
-                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread)
+                                wp.neon_write(f_0_pn, index, _opp_indices[l], self.store_dtype(_f1_thread))
 
         @neon.Container.factory(name="collide_coarse")
         def collide_coarse(
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 5e16f680..e804678e 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -474,14 +474,12 @@ def neon_apply_aux_recovery_bc(
                             # Perform the swapping of data
                             if l == lattice_central_index:
                                 # (i) Recover the values stored in the central index of f_1
-                                # TODO: Add store dtype
                                 _f1_thread = wp.neon_read(f_1_pn, index, l)
-                                wp.neon_write(f_0_pn, index, l, _f1_thread)
+                                wp.neon_write(f_0_pn, index, l, self.store_dtype(_f1_thread))
                             elif _missing_mask[l] == wp.uint8(1):
                                 # (ii) Recover the values stored in the missing directions of f_1
-                                # TODO: Add store dtype
                                 _f1_thread = wp.neon_read(f_1_pn, index, _opp_indices[l])
-                                wp.neon_write(f_0_pn, index, _opp_indices[l], _f1_thread)
+                                wp.neon_write(f_0_pn, index, _opp_indices[l], self.store_dtype(_f1_thread))
 
         @neon.Container.factory(name="nse_stepper")
         def container(

From 21c669df1bca76667c0ed158bd4bdbd4e3c539a4 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 27 Jun 2025 09:28:50 -0400
Subject: [PATCH 101/208] fixed force calculation for multires. Neon results
 seem to be non-deterministic! WIP: handling BC w/ distance!

---
 .../grid_refinement/stl_flow_past_sphere_3d.py  | 16 +++++++++-------
 .../force/multires_momentum_transfer.py         | 17 ++++++++++++++++-
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index afe9ef85..e1b9ec0f 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -72,7 +72,7 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape
             level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
             box_origin = (0, 0, 0)  # The coarsest level has no origin offset
         else:
-            box_size = tuple([int(shape[i] // 4 * growth) for i in range(3)])
+            box_size = tuple([int(0.3 * shape[i] * growth) for i in range(3)])
             if lvl == 0:
                 box_origin = tuple(
                     [sphere_origin[0] // divider - int(2 * growth * sphere_radius // divider)]
@@ -102,8 +102,8 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape
 precision_policy = PrecisionPolicy.FP32FP32
 velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
 u_max = 0.04
-num_steps = 1000
-post_process_interval = 100
+num_steps = 10000
+post_process_interval = 1000
 
 # Initialize XLB
 xlb.init(
@@ -175,10 +175,10 @@ def bc_profile_warp(index: wp.vec3i):
 bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
-# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
-bc_sphere = HybridBC(
-    bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=True
-)
+bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
+# bc_sphere = HybridBC(
+#     bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=False
+# )
 
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
 
@@ -201,7 +201,9 @@ def bc_profile_warp(index: wp.vec3i):
 
 def print_lift_drag(sim):
     # Compute lift and drag
+    wp.synchronize()
     boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
+    wp.synchronize()
     drag = boundary_force[0]  # x-direction
     lift = boundary_force[2]
     sphere_cross_section = np.pi * sphere_radius**2
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index e9b12b0a..9cc26a12 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -28,6 +28,15 @@ def __init__(
         if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
             raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
 
+        # TODO! The current implementation does not support encoding and decoding of mesh distance in f_1!
+        assert not self.no_slip_bc_instance.needs_mesh_distance, "Mesh distance is not supported for Force Calculation!"
+
+        # Print a warning to the user about the boundary voxels
+        print(
+            "WARNING! make sure boundary voxels are all at the same level and not among the transition regions from one level to another. " \
+            "Otherwise, the results of force calculation are not correct!\n"
+        )
+
     def _construct_neon(self):
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
@@ -48,13 +57,19 @@ def container_launcher(loader: neon.Loader):
                 f_0_pn = loader.get_mres_write_handle(f_0)
                 f_1_pn = loader.get_mres_write_handle(f_1)
 
+                # Important: Note the swap to the order of f_0 and f_1 in the functional call.
+                # This is because the multiresolution simulation first performs collision and then streaming and hence
+                # f_0 refers to the post-streaming distribution function and f_1 refers to the pre-collision distribution function.
+                # This is in contrast to our dense implementations (all backends) where streaming occurs first and is followed by
+                # collision. As a workaround, we can simply swap f_0 and f_1 in the functional call.
+
                 @wp.func
                 def container_kernel(index: Any):
                     # apply the functional
                     functional(
                         index,
-                        f_0_pn,
                         f_1_pn,
+                        f_0_pn,
                         bc_mask_pn,
                         missing_mask_pn,
                         force,

From 17ccb53ecdad7b9b54022422f1c32e1cc0faedd1 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 4 Jul 2025 11:05:45 -0400
Subject: [PATCH 102/208] Added the sphere example using the cuboid mesher

---
 .../cuboid_flow_past_sphere_3d.py             | 253 ++++++++++++++++++
 xlb/utils/mesher.py                           |   4 +-
 2 files changed, 255 insertions(+), 2 deletions(-)
 create mode 100644 examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
new file mode 100644
index 00000000..8b6bdd6b
--- /dev/null
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -0,0 +1,253 @@
+import neon
+import warp as wp
+import numpy as np
+import time
+
+import xlb
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import (
+    FullwayBounceBackBC,
+    HalfwayBounceBackBC,
+    RegularizedBC,
+    ExtrapolationOutflowBC,
+    DoNothingBC,
+    ZouHeBC,
+    HybridBC,
+)
+from xlb.operator.boundary_masker import MeshVoxelizationMethod
+from xlb.utils.mesher import make_cuboid_mesh
+from xlb.operator.force import MultiresMomentumTransfer
+
+
+def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
+    """
+    Generate a cuboid mesh based on the provided voxel size and domain multipliers.
+    """
+    import open3d as o3d
+    import os
+
+    # Domain multipliers for each refinement level
+    # First entry should be full domain size
+    # Domain multipliers
+    domainMultiplier = [
+        [15, 15, 7, 7, 7, 7],  # -x, x, -y, y, -z, z
+        [6, 8, 5, 5, 5, 5],  # -x, x, -y, y, -z, z
+        [4, 6, 4, 4, 4, 4],
+        [2, 4, 2, 2, 2, 2],
+        # [1, 2, 1, 1, 1, 1],
+        # [0.4, 1, 0.4, 0.4, 0.4, 0.4],
+        # [0.2, 0.4, 0.2, 0.2, 0.2, 0.2],
+    ]
+
+    # Load the mesh
+    mesh = o3d.io.read_triangle_mesh(stl_filename)
+    if mesh.is_empty():
+        raise ValueError("Loaded mesh is empty or invalid.")
+
+    # Compute original bounds
+    aabb = mesh.get_axis_aligned_bounding_box()
+    min_bound = aabb.get_min_bound()
+    max_bound = aabb.get_max_bound()
+    partSize = max_bound - min_bound
+
+    # smallest voxel size
+    voxel_size = min(partSize) / num_finest_voxels_across_part
+
+    # Compute translation to put mesh into first octant of that domain
+    shift = np.array(
+        [
+            domainMultiplier[0][0] * partSize[0] - min_bound[0],
+            domainMultiplier[0][2] * partSize[1] - min_bound[1],
+            domainMultiplier[0][4] * partSize[2] - min_bound[2],
+        ],
+        dtype=float,
+    )
+
+    # Apply translation and save out temp stl
+    mesh.translate(shift)
+    mesh.compute_vertex_normals()
+    mesh_vertices = np.asarray(mesh.vertices) / voxel_size
+    o3d.io.write_triangle_mesh("temp.stl", mesh)
+
+    # Mesh based on temp stl
+    level_data = make_cuboid_mesh(
+        voxel_size,
+        domainMultiplier,
+        "temp.stl",
+    )
+    grid_shape_finest = tuple([i * 2 ** (len(level_data) - 1) for i in level_data[-1][0].shape])
+    print(f"Full shape based on finest voxels size is {grid_shape_finest}")
+    os.remove("temp.stl")
+    return level_data, mesh_vertices, tuple([int(a) for a in grid_shape_finest])
+
+
+def prepare_sparsity_pattern(level_data):
+    """
+    Prepare the sparsity pattern for the multiresolution grid based on the level data. "level_data" is expected to be formatted as in
+    the output of "make_cuboid_mesh".
+    """
+    num_levels = len(level_data)
+    sparsity_pattern = []
+    level_origins = []
+    sparsity_pattern = []
+    for lvl in range(num_levels):
+        # Get the level mask from the level data
+        level_mask = level_data[lvl][0]
+
+        # Ensure level_mask is contiguous int32
+        level_mask = np.ascontiguousarray(level_mask, dtype=np.int32)
+
+        # Append the level mask to the sparsity pattern
+        sparsity_pattern.append(level_mask)
+
+        # Get the origin for this level
+        level_origins.append(level_data[lvl][2])
+
+    return sparsity_pattern, level_origins
+
+
+# -------------------------- Simulation Setup --------------------------
+
+# The following parameters define the resolution of the voxelized grid
+sphere_radius = 5
+num_finest_voxels_across_part = 2 * sphere_radius
+
+# Other setup parameters
+Re = 5000.0
+compute_backend = ComputeBackend.NEON
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+u_max = 0.04
+num_steps = 10000
+post_process_interval = 1000
+
+# Initialize XLB
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+# Generate the cuboid mesh and sphere vertices
+stl_filename = "examples/cfd/stl-files/sphere.stl"
+level_data, sphere, grid_shape_finest = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part)
+
+# Prepare the sparsity pattern and origins from the level data
+sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
+
+# get the number of levels
+num_levels = len(level_data)
+
+# Create the multires grid
+grid = multires_grid_factory(
+    grid_shape_finest,
+    velocity_set=velocity_set,
+    sparsity_pattern_list=sparsity_pattern,
+    sparsity_pattern_origins=[neon.Index_3d(*box_origin) for box_origin in level_origins],
+)
+
+# Define Boundary Indices
+coarsest_level = grid.count_levels - 1
+box = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level))
+box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level), remove_edges=True)
+inlet = box_no_edge["left"]
+outlet = box_no_edge["right"]
+walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
+walls = np.unique(np.array(walls), axis=-1).tolist()
+
+
+# Define Boundary Conditions
+def bc_profile():
+    assert compute_backend == ComputeBackend.NEON
+
+    # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
+    nx, ny, nz = grid_shape_finest
+    H_y = float(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
+    H_z = float(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i):
+        # Poiseuille flow profile: parabolic velocity distribution
+        y = wp.float32(index[1])
+        z = wp.float32(index[2])
+
+        # Calculate normalized distance from center
+        y_center = y - (H_y / 2.0)
+        z_center = z - (H_z / 2.0)
+        r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+
+        # Parabolic profile: u = u_max * (1 - r²)
+        return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+
+    return bc_profile_warp
+
+
+# Convert bc indices to a list of lists (first entry corresponds to the finest level)
+inlet = [[] for _ in range(num_levels - 1)] + [inlet]
+outlet = [[] for _ in range(num_levels - 1)] + [outlet]
+walls = [[] for _ in range(num_levels - 1)] + [walls]
+
+# Initialize Boundary Conditions
+bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
+# Alternatively, use a prescribed velocity profile
+# bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
+bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
+# bc_outlet = ExtrapolationOutflowBC(indices=outlet)
+bc_outlet = DoNothingBC(indices=outlet)
+bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
+# bc_sphere = HybridBC(
+#     bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=False
+# )
+
+boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
+
+# Configure the simulation relaxation time
+visc = u_max * num_finest_voxels_across_part / Re
+omega = 1.0 / (3.0 * visc + 0.5)
+
+# Define a multi-resolution simulation manager
+sim = xlb.helper.MultiresSimulationManager(
+    omega=omega,
+    grid=grid,
+    boundary_conditions=boundary_conditions,
+    collision_type="KBC",
+)
+
+# Setup Momentum Transfer for Force Calculation
+bc_sphre = boundary_conditions[-1]
+momentum_transfer = MultiresMomentumTransfer(bc_sphere, compute_backend=compute_backend)
+
+
+def print_lift_drag(sim):
+    # Compute lift and drag
+    wp.synchronize()
+    boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
+    wp.synchronize()
+    drag = boundary_force[0]  # x-direction
+    lift = boundary_force[2]
+    sphere_cross_section = np.pi * sphere_radius**2
+    u_avg = 0.5 * u_max
+    cd = 2.0 * drag / (u_avg**2 * sphere_cross_section)
+    cl = 2.0 * lift / (u_avg**2 * sphere_cross_section)
+    print(f"CD={cd}, CL={cl}")
+
+
+# -------------------------- Simulation Loop --------------------------
+
+wp.synchronize()
+start_time = time.time()
+for step in range(num_steps):
+    sim.step()
+
+    if step % post_process_interval == 0 or step == num_steps - 1:
+        # TODO: Issues in the vtk output for rectangular cuboids (as if a duboid grid with the largest side is assumed)
+        sim.export_macroscopic("multires_flow_over_sphere_3d_")
+        print_lift_drag(sim)
+        wp.synchronize()
+        end_time = time.time()
+        elapsed = end_time - start_time
+        print(f"Completed step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")
+        start_time = time.time()
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index ed6f971b..45072440 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -116,7 +116,7 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_name):
         voxel_matrix_k[i_start:i_end, j_start:j_end, k_start:k_end] = 0
 
     # Step 3 Convert to Indices from STL units
-    level_data = [(dr, int(v / voxel_size), np.round(dOrigin / voxel_size).astype(int), l) for dr, v, dOrigin, l in level_data]
+    num_levels = len(level_data)
+    level_data = [(dr, int(v / voxel_size), np.round(dOrigin / v).astype(int), num_levels - 1 - l) for dr, v, dOrigin, l in level_data]
 
-    # Reverse to have finest level first
     return list(reversed(level_data))

From a4f7b7208caadf8ce6e256454e66becf4b7efa17 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 4 Jul 2025 11:21:12 -0400
Subject: [PATCH 103/208] neon_get_type does not work. reverting back to
 hard-coded solution for now.

---
 xlb/operator/force/multires_momentum_transfer.py | 2 +-
 xlb/operator/operator.py                         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 9cc26a12..92e297c9 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -33,7 +33,7 @@ def __init__(
 
         # Print a warning to the user about the boundary voxels
         print(
-            "WARNING! make sure boundary voxels are all at the same level and not among the transition regions from one level to another. " \
+            "WARNING! make sure boundary voxels are all at the same level and not among the transition regions from one level to another. "
             "Otherwise, the results of force calculation are not correct!\n"
         )
 
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 52e9740a..25198c03 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -246,8 +246,8 @@ def read_field_neighbor(
             ):
                 # This function reads a field value at a given neighboring index and direction.
                 unused_is_valid = wp.bool(False)
-                dtype = neon_get_type(field)  # This is a placeholder to ensure the dtype is set correctly
-                return wp.neon_read_ngh(field, index, offset, direction, dtype(0.0), unused_is_valid)
+                # dtype = neon_get_type(field)  # This is a placeholder to ensure the dtype is set correctly
+                return wp.neon_read_ngh(field, index, offset, direction, wp.uint8(0.0), unused_is_valid)
 
         else:
             raise ValueError(f"Unsupported compute backend: {self.compute_backend}")

From ff12d92d6fde1df9ade5f3b2a70a88fc2b34f7f8 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 4 Jul 2025 11:22:04 -0400
Subject: [PATCH 104/208] Added ExportMultiresHDF5 as a new class in mesher
 with Neon container implementation

---
 .../cuboid_flow_past_sphere_3d.py             |  17 +-
 xlb/utils/__init__.py                         |   2 +-
 xlb/utils/mesher.py                           | 321 ++++++++++++++++++
 3 files changed, 337 insertions(+), 3 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 8b6bdd6b..a75d86d4 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -135,6 +135,12 @@ def prepare_sparsity_pattern(level_data):
 stl_filename = "examples/cfd/stl-files/sphere.stl"
 level_data, sphere, grid_shape_finest = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part)
 
+
+# Define exporter object for hdf5 output
+from xlb.utils import ExportMultiresHDF5
+
+h5exporter = ExportMultiresHDF5(level_data)
+
 # Prepare the sparsity pattern and origins from the level data
 sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
 
@@ -243,8 +249,15 @@ def print_lift_drag(sim):
     sim.step()
 
     if step % post_process_interval == 0 or step == num_steps - 1:
-        # TODO: Issues in the vtk output for rectangular cuboids (as if a duboid grid with the largest side is assumed)
-        sim.export_macroscopic("multires_flow_over_sphere_3d_")
+        # Call the Macroscopic operator to compute macroscopic fields
+        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
+        wp.synchronize()
+
+        # Call the exporter to save the current state
+        filename = f"multires_flow_over_sphere_3d_step_{step:04d}"
+        h5exporter(filename, sim.u, sim.rho, compression="gzip", compression_opts=2)
+
+        # Print lift and drag coefficients
         print_lift_drag(sim)
         wp.synchronize()
         end_time = time.time()
diff --git a/xlb/utils/__init__.py b/xlb/utils/__init__.py
index cb6c35cb..51c97c4c 100644
--- a/xlb/utils/__init__.py
+++ b/xlb/utils/__init__.py
@@ -7,4 +7,4 @@
     voxelize_stl,
     axangle2mat,
 )
-from .mesher import make_cuboid_mesh
+from .mesher import make_cuboid_mesh, ExportMultiresHDF5
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 45072440..57af17a9 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -120,3 +120,324 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_name):
     level_data = [(dr, int(v / voxel_size), np.round(dOrigin / v).astype(int), num_levels - 1 - l) for dr, v, dOrigin, l in level_data]
 
     return list(reversed(level_data))
+
+
+class ExportMultiresHDF5(object):
+    def __init__(self, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
+        """
+        Initialize the ExportMultiresHDF5 object.
+
+        Parameters
+        ----------
+        levels_data : list of tuples
+            Each tuple contains (data, voxel_size, origin, level).
+        filename : str
+            The name of the output HDF5 file.
+        fields : dict, optional
+            A dictionary of fields to be included in the HDF5 file.
+        scale : float or tuple, optional
+            Scale factor for the coordinates.
+        offset : tuple, optional
+            Offset to be applied to the coordinates.
+        compression : str, optional
+            Compression method for the HDF5 datasets.
+        compression_opts : int, optional
+            Compression options for the HDF5 datasets.
+        """
+        # Process the multires geometry and extract coordinates and connectivity in the coordinate system of the finest level
+        coordinates, connectivity, level_id_field, total_cells = self.process_geometry(levels_data, scale)
+
+        # Ensure that coordinates and connectivity are not empty
+        assert coordinates.size != 0, "Error: No valid data to process. Check the input levels_data."
+
+        # Merge duplicate points
+        coordinates, connectivity = self._merge_duplicates(coordinates, connectivity)
+
+        # Apply scale and offset
+        coordinates = self._transform_coordinates(coordinates, scale, offset)
+
+        # Assign to self
+        self.levels_data = levels_data
+        self.coordinates = coordinates
+        self.connectivity = connectivity
+        self.level_id_field = level_id_field
+        self.total_cells = total_cells
+
+    def process_geometry(self, levels_data, scale):
+        num_voxels_per_level = [np.sum(data) for data, _, _, _ in levels_data]
+        num_points_per_level = [8 * nv for nv in num_voxels_per_level]
+        point_id_offsets = np.cumsum([0] + num_points_per_level[:-1])
+
+        all_corners = []
+        all_connectivity = []
+        level_id_field = []
+        total_cells = 0
+
+        for level_idx, (data, voxel_size, origin, level) in enumerate(levels_data):
+            origin = origin * voxel_size
+            corners_list, conn_list, _ = self._process_level(data, voxel_size, origin, level, point_id_offsets[level_idx])
+
+            if corners_list:
+                print(f"\tProcessing level {level}: Voxel size {voxel_size * scale}, Origin {origin}, Shape {data.shape}")
+                all_corners.extend(corners_list)
+                all_connectivity.extend(conn_list)
+                num_cells = sum(c.shape[0] for c in conn_list)
+                level_id_field.extend([level] * num_cells)
+                total_cells += num_cells
+            else:
+                print(f"\tSkipping level {level} (no unique data)")
+
+        # Stacking coordinates and connectivity
+        coordinates = np.concatenate(all_corners, axis=0).astype(np.float32)
+        connectivity = np.concatenate(all_connectivity, axis=0).astype(np.int32)
+        level_id_field = np.array(level_id_field, dtype=np.uint8)
+
+        return coordinates, connectivity, level_id_field, total_cells
+
+    def _process_level(self, data, voxel_size, origin, level, point_id_offset):
+        """
+        Given a voxel grid, returns all corners and connectivity in NumPy for this resolution level.
+        """
+        true_indices = np.argwhere(data)
+        if true_indices.size == 0:
+            return [], [], level
+
+        max_voxels_per_chunk = 268_435_450
+        chunks = np.array_split(true_indices, max(1, (len(true_indices) + max_voxels_per_chunk - 1) // max_voxels_per_chunk))
+
+        all_corners = []
+        all_connectivity = []
+        pid_offset = point_id_offset
+
+        for chunk in chunks:
+            if chunk.size == 0:
+                continue
+            corners, connectivity = self._process_voxel_chunk(chunk, np.asarray(origin, dtype=np.float32), voxel_size, pid_offset)
+            all_corners.append(corners)
+            all_connectivity.append(connectivity)
+            pid_offset += len(chunk) * 8
+
+        return all_corners, all_connectivity, level
+
+    def _process_voxel_chunk(self, true_indices, origin, voxel_size, point_id_offset):
+        """
+        Given a set of voxel indices, returns 8 corners and connectivity for each cube using NumPy.
+        """
+        true_indices = np.asarray(true_indices, dtype=np.float32)
+        mins = origin + true_indices * voxel_size
+        offsets = np.array(
+            [
+                [0, 0, 0],
+                [1, 0, 0],
+                [1, 1, 0],
+                [0, 1, 0],
+                [0, 0, 1],
+                [1, 0, 1],
+                [1, 1, 1],
+                [0, 1, 1],
+            ],
+            dtype=np.float32,
+        )
+
+        corners = (mins[:, None, :] + offsets[None, :, :] * voxel_size).reshape(-1, 3).astype(np.float32)
+        base_ids = point_id_offset + np.arange(len(true_indices), dtype=np.int32) * 8
+        connectivity = (base_ids[:, None] + np.arange(8, dtype=np.int32)).astype(np.int32)
+
+        return corners, connectivity
+
+    def save_xdmf(self, h5_filename, xmf_filename, total_cells, num_points, fields={}):
+        # Generate an XDMF file to accompany the HDF5 file
+        print(f"\tGenerating XDMF file: {xmf_filename}")
+        hdf5_rel_path = h5_filename.split("/")[-1]
+        with open(xmf_filename, "w") as xmf:
+            xmf.write(f'''<?xml version="1.0" ?>
+    <!DOCTYPE Xdmf SYSTEM "Xdmf.dtd" []>
+    <Xdmf Version="3.0">
+        <Domain>
+            <Grid Name="VoxelMesh" GridType="Uniform">
+                <Topology TopologyType="Hexahedron" NumberOfElements="{total_cells}">
+                    <DataItem Dimensions="{total_cells} 8" NumberType="Int" Format="HDF">
+                        {hdf5_rel_path}:/Mesh/Connectivity
+                    </DataItem>
+                </Topology>
+                <Geometry GeometryType="XYZ">
+                    <DataItem Dimensions="{num_points} 3" NumberType="Float" Precision="4" Format="HDF">
+                        {hdf5_rel_path}:/Mesh/Points
+                    </DataItem>
+                </Geometry>
+                <Attribute Name="Level" AttributeType="Scalar" Center="Cell">
+                    <DataItem Dimensions="{total_cells}" NumberType="UInt8" Format="HDF">
+                        {hdf5_rel_path}:/Mesh/Level
+                    </DataItem>
+                </Attribute>
+        ''')
+            for field_name in fields.keys():
+                xmf.write(f'''
+            <Attribute Name="{field_name}" AttributeType="Scalar" Center="Cell">
+                <DataItem Dimensions="{total_cells}" NumberType="Float" Precision="4" Format="HDF">
+                {h5_filename}:/Fields/{field_name}
+                </DataItem>
+            </Attribute>
+            ''')
+            xmf.write("""
+                </Grid>
+            </Domain>
+        </Xdmf>
+        """)
+        print("\tXDMF file written successfully")
+        return
+
+    def write_hdf5_file(self, filename, coordinates, connectivity, level_id_field, field_data, compression="gzip", compression_opts=0):
+        """Write the processed mesh data to an HDF5 file.
+        Parameters
+        ----------
+        filename : str
+            The name of the output HDF5 file.
+        coordinates : numpy.ndarray
+            An array of all coordinates.
+        connectivity : numpy.ndarray
+            An array of all connectivity data.
+        level_id_field : numpy.ndarray
+            An array of all level data.
+        field_data : dict
+            A dictionary of all field data.
+        compression : str, optional
+            The compression method to use for the HDF5 file.
+        compression_opts : int, optional
+            The compression options to use for the HDF5 file.
+        """
+        import h5py
+
+        with h5py.File(filename + ".h5", "w") as f:
+            f.create_dataset("/Mesh/Points", data=coordinates, compression=compression, compression_opts=compression_opts, chunks=(100000, 3))
+            f.create_dataset(
+                "/Mesh/Connectivity",
+                data=connectivity,
+                compression=compression,
+                compression_opts=compression_opts,
+                chunks=(30000, 8),
+            )
+            f.create_dataset("/Mesh/Level", data=level_id_field, compression=compression, compression_opts=compression_opts)
+            fg = f.create_group("/Fields")
+            for fname, fdata in field_data.items():
+                fg.create_dataset(fname, data=fdata.astype(np.float32), compression=compression, compression_opts=compression_opts)
+
+    def _merge_duplicates(self, coordinates, connectivity):
+        # Merging duplicate points
+        tolerance = 0.01
+        grid_coords = np.round(coordinates / tolerance).astype(np.int64)
+        hash_keys = grid_coords[:, 0] + grid_coords[:, 1] * 1_000_000 + grid_coords[:, 2] * 1_000_000_000_000
+
+        _, unique_indices, inverse = np.unique(hash_keys, return_index=True, return_inverse=True)
+        coordinates = coordinates[unique_indices]
+        connectivity = inverse[connectivity]
+        return coordinates, connectivity
+
+    def _transform_coordinates(self, coordinates, scale, offset):
+        scale = np.array([scale] * 3 if isinstance(scale, (int, float)) else scale, dtype=np.float32)
+        offset = np.array(offset, dtype=np.float32)
+        return coordinates * scale + offset
+
+    def __call__(self, filename, velocity_neon, density_neon, compression="gzip", compression_opts=0, store_precision=None):
+        from typing import Any
+        import time
+        import neon
+        import warp as wp
+
+        from xlb.compute_backend import ComputeBackend
+        from xlb.grid import grid_factory
+        from xlb import DefaultConfig
+
+        # Ensure that this operator is called on multires grids
+        grid_mres = velocity_neon.get_grid()
+        assert grid_mres.get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
+
+        # Set the default precision policy if not provided
+        if store_precision is None:
+            store_precision = DefaultConfig.default_precision_policy.store_precision
+
+        # number of levels
+        num_levels = grid_mres.get_num_levels()
+        assert num_levels == len(self.levels_data), "Error: Inconsistent number of levels"
+
+        # Prepare the fields to be written by transfering multi-res NEON fields into stacked warp fields
+        fields_data = {
+            "velocity_x": [],
+            "velocity_y": [],
+            "velocity_z": [],
+            "density": [],
+        }
+        for level in range(num_levels):
+            # get the shape of the grid at this level
+            box_shape = self.levels_data[level][0].shape
+
+            # Use the warp backend to create dense fields to be written in multi-res NEON fields
+            grid_dense = grid_factory(box_shape, compute_backend=ComputeBackend.WARP)
+            velocity_warp = grid_dense.create_field(cardinality=3, dtype=store_precision)
+            density_warp = grid_dense.create_field(cardinality=1, dtype=store_precision)
+            refinement = 2**level
+            origin_x, origin_y, origin_z = [int(x) for x in self.levels_data[level][2]]
+
+            @neon.Container.factory(name="HDF5MultiresExporter")
+            def container(
+                velocity_neon: Any,
+                density_neon: Any,
+                velocity_warp: Any,
+                density_warp: Any,
+            ):
+                def launcher(loader: neon.Loader):
+                    loader.set_mres_grid(velocity_neon.get_grid(), level)
+                    velocity_neon_hdl = loader.get_mres_read_handle(velocity_neon)
+                    density_neon_hdl = loader.get_mres_read_handle(density_neon)
+
+                    @wp.func
+                    def kernel(index: Any):
+                        cIdx = wp.neon_global_idx(velocity_neon_hdl, index)
+                        # Get local indices by dividing the global indices (associated with the finest level) by 2^level
+                        # Subtract the origin to get the local indices in the warp field
+                        lx = wp.neon_get_x(cIdx) // refinement - origin_x
+                        ly = wp.neon_get_y(cIdx) // refinement - origin_y
+                        lz = wp.neon_get_z(cIdx) // refinement - origin_z
+
+                        # write the values to the warp field
+                        density_warp[0, lx, ly, lz] = wp.neon_read(density_neon_hdl, index, 0)
+                        for card in range(3):
+                            velocity_warp[card, lx, ly, lz] = wp.neon_read(velocity_neon_hdl, index, card)
+
+                    loader.declare_kernel(kernel)
+
+                return launcher
+
+            # Create the container and run it to fill the warp fields
+            c = container(velocity_neon, density_neon, velocity_warp, density_warp)
+            c.run(0)
+            wp.synchronize()
+
+            # Convert the warp fields to numpy arrays and use level's mask to filter the data
+            mask = self.levels_data[level][0]
+            velocity_np = np.array(wp.to_jax(velocity_warp))
+            vx, vy, vz = velocity_np[0][mask], velocity_np[1][mask], velocity_np[2][mask]
+            rho = np.array(wp.to_jax(density_warp))[0][mask]
+            fields_data["velocity_x"].append(vx)
+            fields_data["velocity_y"].append(vy)
+            fields_data["velocity_z"].append(vz)
+            fields_data["density"].append(rho)
+
+            # Clean up
+            del velocity_warp
+            del density_warp
+
+        # Concatenate all field data
+        for field_name in fields_data.keys():
+            fields_data[field_name] = np.concatenate(fields_data[field_name])
+            assert fields_data[field_name].size == self.total_cells, f"Error: Field {field_name} size mismatch!"
+
+        # Save XDMF file
+        self.save_xdmf(filename + ".h5", filename + ".xmf", self.total_cells, len(self.coordinates), fields_data)
+
+        # Writing HDF5 file
+        print("\tWriting HDF5 file")
+        tic_write = time.perf_counter()
+        self.write_hdf5_file(filename, self.coordinates, self.connectivity, self.level_id_field, fields_data, compression, compression_opts)
+        toc_write = time.perf_counter()
+        print(f"\tHDF5 file written in {toc_write - tic_write:0.1f} seconds")

From 18d44b888644db1638434091716a30737141897b Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 7 Jul 2025 11:04:19 -0400
Subject: [PATCH 105/208] Fixed issues with ExportMultiresHDF5. Outputs are now
 correct!

---
 .../cuboid_flow_past_sphere_3d.py             |  13 +-
 .../stl_flow_past_sphere_3d.py                |   2 -
 xlb/utils/mesher.py                           | 148 +++++++++++-------
 3 files changed, 98 insertions(+), 65 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index a75d86d4..21a7ba52 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -7,7 +7,6 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.precision_policy import PrecisionPolicy
 from xlb.grid import multires_grid_factory
-from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import (
     FullwayBounceBackBC,
     HalfwayBounceBackBC,
@@ -139,6 +138,7 @@ def prepare_sparsity_pattern(level_data):
 # Define exporter object for hdf5 output
 from xlb.utils import ExportMultiresHDF5
 
+# Define an exporter for the multiresolution data
 h5exporter = ExportMultiresHDF5(level_data)
 
 # Prepare the sparsity pattern and origins from the level data
@@ -229,9 +229,7 @@ def bc_profile_warp(index: wp.vec3i):
 
 def print_lift_drag(sim):
     # Compute lift and drag
-    wp.synchronize()
     boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
-    wp.synchronize()
     drag = boundary_force[0]  # x-direction
     lift = boundary_force[2]
     sphere_cross_section = np.pi * sphere_radius**2
@@ -249,12 +247,17 @@ def print_lift_drag(sim):
     sim.step()
 
     if step % post_process_interval == 0 or step == num_steps - 1:
+        # # Export VTK for comparison
+        # tic_write = time.perf_counter()
+        # sim.export_macroscopic("multires_flow_over_sphere_3d_")
+        # toc_write = time.perf_counter()
+        # print(f"\tVTK file written in {toc_write - tic_write:0.1f} seconds")
+
         # Call the Macroscopic operator to compute macroscopic fields
         sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
-        wp.synchronize()
 
         # Call the exporter to save the current state
-        filename = f"multires_flow_over_sphere_3d_step_{step:04d}"
+        filename = f"multires_flow_over_sphere_3d_{step:04d}"
         h5exporter(filename, sim.u, sim.rho, compression="gzip", compression_opts=2)
 
         # Print lift and drag coefficients
diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index e1b9ec0f..abb275b8 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -201,9 +201,7 @@ def bc_profile_warp(index: wp.vec3i):
 
 def print_lift_drag(sim):
     # Compute lift and drag
-    wp.synchronize()
     boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
-    wp.synchronize()
     drag = boundary_force[0]  # x-direction
     lift = boundary_force[2]
     sphere_cross_section = np.pi * sphere_radius**2
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 57af17a9..4f846f07 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -1,5 +1,9 @@
 import numpy as np
 import open3d as o3d
+from typing import Any
+
+import neon
+import warp as wp
 
 
 def adjust_bbox(cuboid_max, cuboid_min, voxel_size_coarsest):
@@ -163,6 +167,12 @@ def __init__(self, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
         self.level_id_field = level_id_field
         self.total_cells = total_cells
 
+        # Prepare and allocate the inputs for the NEON container
+        self.velocity_warp_list, self.density_warp_list, self.origin_list = self._prepare_container_inputs()
+
+        # Construct the NEON container for exporting multi-resolution data
+        self.container = self._construct_neon_container()
+
     def process_geometry(self, levels_data, scale):
         num_voxels_per_level = [np.sum(data) for data, _, _, _ in levels_data]
         num_points_per_level = [8 * nv for nv in num_voxels_per_level]
@@ -338,27 +348,86 @@ def _transform_coordinates(self, coordinates, scale, offset):
         offset = np.array(offset, dtype=np.float32)
         return coordinates * scale + offset
 
-    def __call__(self, filename, velocity_neon, density_neon, compression="gzip", compression_opts=0, store_precision=None):
-        from typing import Any
-        import time
-        import neon
-        import warp as wp
-
+    def _prepare_container_inputs(self, store_precision=None):
+        # load necessary modules
         from xlb.compute_backend import ComputeBackend
         from xlb.grid import grid_factory
         from xlb import DefaultConfig
 
-        # Ensure that this operator is called on multires grids
-        grid_mres = velocity_neon.get_grid()
-        assert grid_mres.get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
+        # Get the number of levels from the levels_data
+        num_levels = len(self.levels_data)
 
         # Set the default precision policy if not provided
         if store_precision is None:
             store_precision = DefaultConfig.default_precision_policy.store_precision
 
+        # Prepare lists to hold warp fields and origins allocated for each level
+        velocity_warp_list = []
+        density_warp_list = []
+        origin_list = []
+        for level in range(num_levels):
+            # get the shape of the grid at this level
+            box_shape = self.levels_data[level][0].shape
+
+            # Use the warp backend to create dense fields that the multi-res NEON fields are copied into
+            grid_dense = grid_factory(box_shape, compute_backend=ComputeBackend.WARP)
+            velocity_warp_list.append(grid_dense.create_field(cardinality=3, dtype=store_precision))
+            density_warp_list.append(grid_dense.create_field(cardinality=1, dtype=store_precision))
+            origin_list.append(wp.vec3i(*([int(x) for x in self.levels_data[level][2]])))
+
+        return velocity_warp_list, density_warp_list, origin_list
+
+    def _construct_neon_container(self):
+        """
+        Constructs a NEON container for exporting multi-resolution data to HDF5.
+        This container will be used to transfer multi-resolution NEON fields into stacked warp fields.
+        """
+
+        @neon.Container.factory(name="HDF5MultiresExporter")
+        def container(
+            velocity_neon: Any,
+            density_neon: Any,
+            velocity_warp: Any,
+            density_warp: Any,
+            origin: Any,
+            level: Any,
+        ):
+            def launcher(loader: neon.Loader):
+                loader.set_mres_grid(velocity_neon.get_grid(), level)
+                velocity_neon_hdl = loader.get_mres_read_handle(velocity_neon)
+                density_neon_hdl = loader.get_mres_read_handle(density_neon)
+                refinement = 2**level
+
+                @wp.func
+                def kernel(index: Any):
+                    cIdx = wp.neon_global_idx(velocity_neon_hdl, index)
+                    # Get local indices by dividing the global indices (associated with the finest level) by 2^level
+                    # Subtract the origin to get the local indices in the warp field
+                    lx = wp.neon_get_x(cIdx) // refinement - origin[0]
+                    ly = wp.neon_get_y(cIdx) // refinement - origin[1]
+                    lz = wp.neon_get_z(cIdx) // refinement - origin[2]
+
+                    # write the values to the warp field
+                    density_warp[0, lx, ly, lz] = wp.neon_read(density_neon_hdl, index, 0)
+                    for card in range(3):
+                        velocity_warp[card, lx, ly, lz] = wp.neon_read(velocity_neon_hdl, index, card)
+
+                loader.declare_kernel(kernel)
+
+            return launcher
+
+        return container
+
+    def __call__(self, filename, velocity_neon, density_neon, compression="gzip", compression_opts=0, store_precision=None):
+        import time
+
+        # Ensure that this operator is called on multires grids
+        grid_mres = velocity_neon.get_grid()
+        assert grid_mres.get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
+
         # number of levels
         num_levels = grid_mres.get_num_levels()
-        assert num_levels == len(self.levels_data), "Error: Inconsistent number of levels"
+        assert num_levels == len(self.levels_data), "Error: Inconsistent number of levels!"
 
         # Prepare the fields to be written by transfering multi-res NEON fields into stacked warp fields
         fields_data = {
@@ -368,65 +437,28 @@ def __call__(self, filename, velocity_neon, density_neon, compression="gzip", co
             "density": [],
         }
         for level in range(num_levels):
-            # get the shape of the grid at this level
-            box_shape = self.levels_data[level][0].shape
-
-            # Use the warp backend to create dense fields to be written in multi-res NEON fields
-            grid_dense = grid_factory(box_shape, compute_backend=ComputeBackend.WARP)
-            velocity_warp = grid_dense.create_field(cardinality=3, dtype=store_precision)
-            density_warp = grid_dense.create_field(cardinality=1, dtype=store_precision)
-            refinement = 2**level
-            origin_x, origin_y, origin_z = [int(x) for x in self.levels_data[level][2]]
-
-            @neon.Container.factory(name="HDF5MultiresExporter")
-            def container(
-                velocity_neon: Any,
-                density_neon: Any,
-                velocity_warp: Any,
-                density_warp: Any,
-            ):
-                def launcher(loader: neon.Loader):
-                    loader.set_mres_grid(velocity_neon.get_grid(), level)
-                    velocity_neon_hdl = loader.get_mres_read_handle(velocity_neon)
-                    density_neon_hdl = loader.get_mres_read_handle(density_neon)
-
-                    @wp.func
-                    def kernel(index: Any):
-                        cIdx = wp.neon_global_idx(velocity_neon_hdl, index)
-                        # Get local indices by dividing the global indices (associated with the finest level) by 2^level
-                        # Subtract the origin to get the local indices in the warp field
-                        lx = wp.neon_get_x(cIdx) // refinement - origin_x
-                        ly = wp.neon_get_y(cIdx) // refinement - origin_y
-                        lz = wp.neon_get_z(cIdx) // refinement - origin_z
-
-                        # write the values to the warp field
-                        density_warp[0, lx, ly, lz] = wp.neon_read(density_neon_hdl, index, 0)
-                        for card in range(3):
-                            velocity_warp[card, lx, ly, lz] = wp.neon_read(velocity_neon_hdl, index, card)
-
-                    loader.declare_kernel(kernel)
-
-                return launcher
 
             # Create the container and run it to fill the warp fields
-            c = container(velocity_neon, density_neon, velocity_warp, density_warp)
-            c.run(0)
-            wp.synchronize()
+            c = self.container(
+                velocity_neon,
+                density_neon,
+                self.velocity_warp_list[level],
+                self.density_warp_list[level],
+                self.origin_list[level],
+                level
+            )
+            c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
             # Convert the warp fields to numpy arrays and use level's mask to filter the data
             mask = self.levels_data[level][0]
-            velocity_np = np.array(wp.to_jax(velocity_warp))
+            velocity_np = np.array(wp.to_jax(self.velocity_warp_list[level]))
+            rho = np.array(wp.to_jax(self.density_warp_list[level]))[0][mask]
             vx, vy, vz = velocity_np[0][mask], velocity_np[1][mask], velocity_np[2][mask]
-            rho = np.array(wp.to_jax(density_warp))[0][mask]
             fields_data["velocity_x"].append(vx)
             fields_data["velocity_y"].append(vy)
             fields_data["velocity_z"].append(vz)
             fields_data["density"].append(rho)
 
-            # Clean up
-            del velocity_warp
-            del density_warp
-
         # Concatenate all field data
         for field_name in fields_data.keys():
             fields_data[field_name] = np.concatenate(fields_data[field_name])

From 9b8455849fcf8757c2bb529ec8fd1b1f85c1dd18 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 8 Jul 2025 11:18:58 -0400
Subject: [PATCH 106/208] commented out bc_mask vtk export

---
 xlb/operator/stepper/nse_multires_stepper.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 763081ce..56a864a5 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -82,10 +82,10 @@ def prepare_fields(self, rho, u, initializer=None):
         # Initialize auxiliary data if needed
         f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_1, bc_mask, missing_mask)
         # bc_mask.update_host(0)
-        bc_mask.update_host(0)
-        f_0.update_host(0)
-        wp.synchronize()
-        bc_mask.export_vti("bc_mask.vti", "bc_mask")
+        # bc_mask.update_host(0)
+        # f_0.update_host(0)
+        # wp.synchronize()
+        # bc_mask.export_vti("bc_mask.vti", "bc_mask")
         # f_0.export_vti("init_f0.vti", 'init_f0')
         # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 

From 55c0cabb93e77493277eec33213a888397ff383e Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 11 Jul 2025 10:52:12 -0400
Subject: [PATCH 107/208] fixed the fp64 issues

---
 .../cuboid_flow_past_sphere_3d.py             | 23 +++++++++++--------
 .../bc_halfway_bounce_back.py                 |  6 +++--
 xlb/operator/boundary_condition/bc_hybrid.py  |  5 ++--
 .../force/multires_momentum_transfer.py       |  2 +-
 4 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 21a7ba52..b2b54222 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -171,22 +171,25 @@ def bc_profile():
 
     # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
     nx, ny, nz = grid_shape_finest
-    H_y = float(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
-    H_z = float(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+    dtype = precision_policy.compute_precision.wp_dtype
+    H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
+    H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+    two = dtype(2.0)
+    u_max_wp = dtype(u_max)
 
     @wp.func
     def bc_profile_warp(index: wp.vec3i):
         # Poiseuille flow profile: parabolic velocity distribution
-        y = wp.float32(index[1])
-        z = wp.float32(index[2])
+        y = dtype(index[1])
+        z = dtype(index[2])
 
         # Calculate normalized distance from center
-        y_center = y - (H_y / 2.0)
-        z_center = z - (H_z / 2.0)
-        r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+        y_center = y - (H_y / two)
+        z_center = z - (H_z / two)
+        r_squared = (two * y_center / H_y) ** two + (two * z_center / H_z) ** two
 
         # Parabolic profile: u = u_max * (1 - r²)
-        return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+        return wp.vec(u_max_wp * wp.max(dtype(0.0), dtype(1.0) - r_squared), length=1)
 
     return bc_profile_warp
 
@@ -236,7 +239,7 @@ def print_lift_drag(sim):
     u_avg = 0.5 * u_max
     cd = 2.0 * drag / (u_avg**2 * sphere_cross_section)
     cl = 2.0 * lift / (u_avg**2 * sphere_cross_section)
-    print(f"CD={cd}, CL={cl}")
+    print(f"\tCD={cd}, CL={cl}")
 
 
 # -------------------------- Simulation Loop --------------------------
@@ -265,5 +268,5 @@ def print_lift_drag(sim):
         wp.synchronize()
         end_time = time.time()
         elapsed = end_time - start_time
-        print(f"Completed step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")
+        print(f"\tCompleted step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")
         start_time = time.time()
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 3d9edeab..47800c67 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -84,13 +84,15 @@ def __init__(
             if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
                 if self.velocity_set.d == 2:
                     prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
-                prescribed_value = wp.vec(3, dtype=self.precision_policy.store_precision.wp_dtype)(prescribed_value)
+                prescribed_value = wp.vec(3, dtype=self.store_dtype)(prescribed_value)
             self.profile = self._create_constant_prescribed_profile(prescribed_value)
 
     def _create_constant_prescribed_profile(self, prescribed_value):
+        _u_vec = wp.vec(3, dtype=self.store_dtype)
+
         @wp.func
         def prescribed_profile_warp(index: Any, time: Any):
-            return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
+            return _u_vec(prescribed_value[0], prescribed_value[1], prescribed_value[2])
 
         def prescribed_profile_jax():
             return jnp.array(prescribed_value, dtype=self.precision_policy.store_precision.jax_dtype).reshape(-1, 1)
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 0c977368..e403a76c 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -99,11 +99,12 @@ def __init__(
                 prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
 
             # create a constant prescribed profile
-            prescribed_value = wp.vec(3, dtype=self.compute_dtype)(prescribed_value)
+            _u_vec = wp.vec(3, dtype=self.store_dtype)
+            prescribed_value = _u_vec(prescribed_value)
 
             @wp.func
             def prescribed_profile_warp(index: Any, time: Any):
-                return wp.vec3(prescribed_value[0], prescribed_value[1], prescribed_value[2])
+                return _u_vec(prescribed_value[0], prescribed_value[1], prescribed_value[2])
 
             self.profile = prescribed_profile_warp
 
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 92e297c9..d1db6b07 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -91,7 +91,7 @@ def neon_implementation(
         stream=0,
     ):
         # Ensure the force is initialized to zero
-        self.force *= 0.0
+        self.force *= self.compute_dtype(0.0)
 
         # Define the neon functionals needed for this operation
         self.stream_functional = self.stream.neon_functional

From 74b3247c9d76cebb953670d63ee69ea3a8583f24 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 11 Jul 2025 11:21:10 -0400
Subject: [PATCH 108/208] minor: updated rotating sphere example to use latest
 voxelization method

---
 examples/cfd/rotating_sphere_3d.py | 17 +++++++++++------
 xlb/utils/mesher.py                |  8 +-------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index 671feb90..93b6b2f8 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -1,6 +1,11 @@
 import xlb
 import trimesh
 import time
+import warp as wp
+import numpy as np
+import jax.numpy as jnp
+from typing import Any
+
 from xlb.compute_backend import ComputeBackend
 from xlb.precision_policy import PrecisionPolicy
 from xlb.grid import grid_factory
@@ -15,15 +20,11 @@
 from xlb.operator.force.momentum_transfer import MomentumTransfer
 from xlb.operator.macroscopic import Macroscopic
 from xlb.utils import save_fields_vtk, save_image
-import warp as wp
-import numpy as np
-import jax.numpy as jnp
 import matplotlib.pyplot as plt
 from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator import Operator
-from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
-
+from xlb.operator.boundary_masker import MeshVoxelizationMethod
 
 # -------------------------- Simulation Setup --------------------------
 
@@ -125,7 +126,11 @@ def bc_profile_warp(index: wp.vec3i, time: Any):
 bc_do_nothing = DoNothingBC(indices=outlet)
 # bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method="ray", profile=bc_profile())
 bc_sphere = HybridBC(
-    bc_method="nonequilibrium_regularized", mesh_vertices=sphere, use_mesh_distance=True, voxelization_method="ray", profile=bc_profile()
+    bc_method="nonequilibrium_regularized",
+    mesh_vertices=sphere,
+    use_mesh_distance=True,
+    voxelization_method=MeshVoxelizationMethod.RAY,
+    profile=bc_profile(),
 )
 # Not assining BC for walls makes them periodic.
 boundary_conditions = [bc_left, bc_do_nothing, bc_sphere]
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 4f846f07..c656f61c 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -437,15 +437,9 @@ def __call__(self, filename, velocity_neon, density_neon, compression="gzip", co
             "density": [],
         }
         for level in range(num_levels):
-
             # Create the container and run it to fill the warp fields
             c = self.container(
-                velocity_neon,
-                density_neon,
-                self.velocity_warp_list[level],
-                self.density_warp_list[level],
-                self.origin_list[level],
-                level
+                velocity_neon, density_neon, self.velocity_warp_list[level], self.density_warp_list[level], self.origin_list[level], level
             )
             c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 

From d344a0c7aa5674eb2bde3378f33cd2169f0d33a9 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 11 Jul 2025 12:18:24 -0400
Subject: [PATCH 109/208] minor fix in other examples for fp64

---
 examples/cfd/flow_past_sphere_3d.py           | 16 +++++++++-----
 .../cuboid_flow_past_sphere_3d.py             |  2 +-
 .../grid_refinement/flow_past_sphere_3d.py    | 22 +++++++++++--------
 .../stl_flow_past_sphere_3d.py                | 21 ++++++++++--------
 examples/cfd/rotating_sphere_3d.py            |  9 ++++----
 .../bc_halfway_bounce_back.py                 |  2 +-
 6 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/examples/cfd/flow_past_sphere_3d.py b/examples/cfd/flow_past_sphere_3d.py
index 12a2c9e1..6fc399bb 100644
--- a/examples/cfd/flow_past_sphere_3d.py
+++ b/examples/cfd/flow_past_sphere_3d.py
@@ -81,20 +81,24 @@ def bc_profile_jax():
         return bc_profile_jax
 
     elif compute_backend == ComputeBackend.WARP:
+        wp_dtype = precision_policy.compute_precision.wp_dtype
+        H_y = wp_dtype(grid_shape[1] - 1)  # Height in y direction
+        H_z = wp_dtype(grid_shape[2] - 1)  # Height in z direction
+        two = wp_dtype(2.0)
 
         @wp.func
         def bc_profile_warp(index: wp.vec3i):
             # Poiseuille flow profile: parabolic velocity distribution
-            y = wp.float32(index[1])
-            z = wp.float32(index[2])
+            y = wp_dtype(index[1])
+            z = wp_dtype(index[2])
 
             # Calculate normalized distance from center
-            y_center = y - (H_y / 2.0)
-            z_center = z - (H_z / 2.0)
-            r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+            y_center = y - (H_y / two)
+            z_center = z - (H_z / two)
+            r_squared = (two * y_center / H_y) ** two + (two * z_center / H_z) ** two
 
             # Parabolic profile: u = u_max * (1 - r²)
-            return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+            return wp.vec(wp_dtype(u_max) * wp.max(wp_dtype(0.0), wp_dtype(1.0) - r_squared), length=1)
 
         return bc_profile_warp
 
diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index b2b54222..709e5ea9 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -170,7 +170,7 @@ def bc_profile():
     assert compute_backend == ComputeBackend.NEON
 
     # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
-    nx, ny, nz = grid_shape_finest
+    _, ny, nz = grid_shape_finest
     dtype = precision_policy.compute_precision.wp_dtype
     H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
     H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index 57adac64..1d76c5fa 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -82,23 +82,27 @@
 def bc_profile():
     assert compute_backend == ComputeBackend.NEON
 
-    # Note nx, ny, nz are the dimensions of the grid at the finest level
-    H_y = float(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
-    H_z = float(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+    # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
+    _, ny, nz = grid_shape
+    dtype = precision_policy.compute_precision.wp_dtype
+    H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
+    H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+    two = dtype(2.0)
+    u_max_wp = dtype(u_max)
 
     @wp.func
     def bc_profile_warp(index: wp.vec3i):
         # Poiseuille flow profile: parabolic velocity distribution
-        y = wp.float32(index[1])
-        z = wp.float32(index[2])
+        y = dtype(index[1])
+        z = dtype(index[2])
 
         # Calculate normalized distance from center
-        y_center = y - (H_y / 2.0)
-        z_center = z - (H_z / 2.0)
-        r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+        y_center = y - (H_y / two)
+        z_center = z - (H_z / two)
+        r_squared = (two * y_center / H_y) ** two + (two * z_center / H_z) ** two
 
         # Parabolic profile: u = u_max * (1 - r²)
-        return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+        return wp.vec(u_max_wp * wp.max(dtype(0.0), dtype(1.0) - r_squared), length=1)
 
     return bc_profile_warp
 
diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index abb275b8..1b3cd039 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -142,23 +142,26 @@ def bc_profile():
     assert compute_backend == ComputeBackend.NEON
 
     # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
-    nx, ny, nz = grid_shape
-    H_y = float(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
-    H_z = float(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+    _, ny, nz = grid_shape
+    dtype = precision_policy.compute_precision.wp_dtype
+    H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
+    H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+    two = dtype(2.0)
+    u_max_wp = dtype(u_max)
 
     @wp.func
     def bc_profile_warp(index: wp.vec3i):
         # Poiseuille flow profile: parabolic velocity distribution
-        y = wp.float32(index[1])
-        z = wp.float32(index[2])
+        y = dtype(index[1])
+        z = dtype(index[2])
 
         # Calculate normalized distance from center
-        y_center = y - (H_y / 2.0)
-        z_center = z - (H_z / 2.0)
-        r_squared = (2.0 * y_center / H_y) ** 2.0 + (2.0 * z_center / H_z) ** 2.0
+        y_center = y - (H_y / two)
+        z_center = z - (H_z / two)
+        r_squared = (two * y_center / H_y) ** two + (two * z_center / H_z) ** two
 
         # Parabolic profile: u = u_max * (1 - r²)
-        return wp.vec(u_max * wp.max(0.0, 1.0 - r_squared), length=1)
+        return wp.vec(u_max_wp * wp.max(dtype(0.0), dtype(1.0) - r_squared), length=1)
 
     return bc_profile_warp
 
diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index 93b6b2f8..a496b96a 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -105,16 +105,17 @@
 
 # Define rotating boundary profile
 def bc_profile():
-    _u_vec = wp.vec(velocity_set.d, dtype=precision_policy.compute_precision.wp_dtype)
+    dtype = precision_policy.compute_precision.wp_dtype
+    _u_vec = wp.vec(velocity_set.d, dtype=dtype)
     angular_velocity = _u_vec(0.0, rot_rate, 0.0)
     origin_np = shift + diam / 2
     origin_wp = _u_vec(origin_np[0], origin_np[1], origin_np[2])
 
     @wp.func
     def bc_profile_warp(index: wp.vec3i, time: Any):
-        x = wp.float32(index[0])
-        y = wp.float32(index[1])
-        z = wp.float32(index[2])
+        x = dtype(index[0])
+        y = dtype(index[1])
+        z = dtype(index[2])
         surface_coord = _u_vec(x, y, z) - origin_wp
         return wp.cross(angular_velocity, surface_coord)
 
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 47800c67..e458efa7 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -88,7 +88,7 @@ def __init__(
             self.profile = self._create_constant_prescribed_profile(prescribed_value)
 
     def _create_constant_prescribed_profile(self, prescribed_value):
-        _u_vec = wp.vec(3, dtype=self.store_dtype)
+        _u_vec = wp.vec(3, dtype=self.precision_policy.store_precision.wp_dtype)
 
         @wp.func
         def prescribed_profile_warp(index: Any, time: Any):

From 8c2a251622090c548b7011ae1475082d1b557cb8 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 11 Jul 2025 13:39:43 -0400
Subject: [PATCH 110/208] Added an OutletInitializer as a helper initializer
 class

---
 examples/cfd/rotating_sphere_3d.py | 59 +----------------------------
 xlb/helper/__init__.py             |  1 +
 xlb/helper/initializers.py         | 60 ++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 57 deletions(-)

diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index a496b96a..00415ef5 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -144,64 +144,9 @@ def bc_profile_warp(index: wp.vec3i, time: Any):
     collision_type="KBC",
 )
 
-
-# Defining an initializer for outlet only
-class OutletInitializer(Operator):
-    def __init__(
-        self,
-        wind_speed=None,
-        grid_shape=None,
-        velocity_set: VelocitySet = None,
-        precision_policy=None,
-        compute_backend=None,
-    ):
-        self.wind_speed = wind_speed
-        self.rho = 1.0
-        self.grid_shape = grid_shape
-        self.equilibrium = QuadraticEquilibrium(velocity_set=velocity_set, precision_policy=precision_policy, compute_backend=compute_backend)
-        super().__init__(velocity_set, precision_policy, compute_backend)
-
-    def _construct_warp(self):
-        nx, ny, nz = self.grid_shape
-        _q = self.velocity_set.q
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        _rho = self.compute_dtype(self.rho)
-        _u = _u_vec(self.wind_speed, 0.0, 0.0)
-        _w = self.velocity_set.w
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel(f: wp.array4d(dtype=Any)):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # Set the velocity at the outlet (i.e. where i = nx-1)
-            if index[0] == nx - 1:
-                _feq = self.equilibrium.warp_functional(_rho, _u)
-                for l in range(_q):
-                    f[l, index[0], index[1], index[2]] = _feq[l]
-            else:
-                # In the rest of the domain, we assume zero velocity and equilibrium distribution.
-                for l in range(_q):
-                    f[l, index[0], index[1], index[2]] = _w[l]
-
-        return None, kernel
-
-    @Operator.register_backend(xlb.ComputeBackend.WARP)
-    def warp_implementation(self, f):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[
-                f,
-            ],
-            dim=f.shape[1:],
-        )
-        return f
-
-
 # Make initializer operator
+from xlb.helper.initializers import OutletInitializer
+
 initializer = OutletInitializer(
     wind_speed=wind_speed,
     grid_shape=grid_shape,
diff --git a/xlb/helper/__init__.py b/xlb/helper/__init__.py
index d6aa42c3..687dc547 100644
--- a/xlb/helper/__init__.py
+++ b/xlb/helper/__init__.py
@@ -1,5 +1,6 @@
 from xlb.helper.nse_fields import create_nse_fields
 from xlb.helper.initializers import initialize_eq
 from xlb.helper.initializers import initialize_multires_eq
+from xlb.helper.initializers import OutletInitializer
 from xlb.helper.check_boundary_overlaps import check_bc_overlaps
 from xlb.helper.simulation_manager import MultiresSimulationManager
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 6ad4dfa2..d6e3343c 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -1,3 +1,7 @@
+import warp as wp
+from typing import Any
+from xlb.operator import Operator
+from xlb.velocity_set import VelocitySet
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator.equilibrium import MultiresQuadraticEquilibrium
@@ -28,3 +32,59 @@ def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho
     equilibrium = MultiresQuadraticEquilibrium()
     equilibrium(rho, u, f, stream=0)
     return f
+
+
+# Defining an initializer for outlet only
+class OutletInitializer(Operator):
+    def __init__(
+        self,
+        wind_speed=None,
+        grid_shape=None,
+        velocity_set: VelocitySet = None,
+        precision_policy=None,
+        compute_backend=None,
+    ):
+        self.wind_speed = wind_speed
+        self.rho = 1.0
+        self.grid_shape = grid_shape
+        self.equilibrium = QuadraticEquilibrium(velocity_set=velocity_set, precision_policy=precision_policy, compute_backend=compute_backend)
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+    def _construct_warp(self):
+        nx, ny, nz = self.grid_shape
+        _q = self.velocity_set.q
+        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        _rho = self.compute_dtype(self.rho)
+        _u = _u_vec(self.wind_speed, 0.0, 0.0)
+        _w = self.velocity_set.w
+
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(f: wp.array4d(dtype=Any)):
+            # Get the global index
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            # Set the velocity at the outlet (i.e. where i = nx-1)
+            if index[0] == nx - 1:
+                _feq = self.equilibrium.warp_functional(_rho, _u)
+                for l in range(_q):
+                    f[l, index[0], index[1], index[2]] = _feq[l]
+            else:
+                # In the rest of the domain, we assume zero velocity and equilibrium distribution.
+                for l in range(_q):
+                    f[l, index[0], index[1], index[2]] = _w[l]
+
+        return None, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[
+                f,
+            ],
+            dim=f.shape[1:],
+        )
+        return f

From d9506af86adc782940ac72f098f30c416b0a1c13 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 11 Jul 2025 16:37:04 -0400
Subject: [PATCH 111/208] replaced open3d with trimesh in all files and
 examples, which gets rid of weird voxelization artifacts due to open3d

---
 .../cuboid_flow_past_sphere_3d.py             | 19 +++++-----
 .../stl_flow_past_sphere_3d.py                | 35 ++++++++-----------
 xlb/utils/mesher.py                           | 15 ++++----
 3 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 709e5ea9..9ddd56f6 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -25,7 +25,7 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     """
     Generate a cuboid mesh based on the provided voxel size and domain multipliers.
     """
-    import open3d as o3d
+    import trimesh
     import os
 
     # Domain multipliers for each refinement level
@@ -42,14 +42,13 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     ]
 
     # Load the mesh
-    mesh = o3d.io.read_triangle_mesh(stl_filename)
-    if mesh.is_empty():
-        raise ValueError("Loaded mesh is empty or invalid.")
+    mesh = trimesh.load_mesh(stl_filename, process=False)
+    assert not mesh.is_empty, ValueError("Loaded mesh is empty or invalid.")
 
     # Compute original bounds
-    aabb = mesh.get_axis_aligned_bounding_box()
-    min_bound = aabb.get_min_bound()
-    max_bound = aabb.get_max_bound()
+    # Find voxel size and sphere radius
+    min_bound = mesh.vertices.min(axis=0)
+    max_bound = mesh.vertices.max(axis=0)
     partSize = max_bound - min_bound
 
     # smallest voxel size
@@ -66,10 +65,10 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     )
 
     # Apply translation and save out temp stl
-    mesh.translate(shift)
-    mesh.compute_vertex_normals()
+    mesh.apply_translation(shift)
+    _ = mesh.vertex_normals
     mesh_vertices = np.asarray(mesh.vertices) / voxel_size
-    o3d.io.write_triangle_mesh("temp.stl", mesh)
+    mesh.export("temp.stl")
 
     # Mesh based on temp stl
     level_data = make_cuboid_mesh(
diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
index 1b3cd039..a840ae23 100644
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
@@ -26,40 +26,33 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape
     """
     Generate a cuboid mesh based on the provided voxel size and domain multipliers.
     """
-    import open3d as o3d
-    import os
+    import trimesh
 
     # STL position
     nx, ny, nz = grid_shape
     sphere_origin = (nx // 6, ny // 2, nz // 2)
 
     # Load the mesh
-    mesh = o3d.io.read_triangle_mesh(stl_filename)
-    if mesh.is_empty():
-        raise ValueError("Loaded mesh is empty or invalid.")
-
-    # Compute original bounds
-    aabb = mesh.get_axis_aligned_bounding_box()
-    min_bound = aabb.get_min_bound()
-    max_bound = aabb.get_max_bound()
-    partSize = max_bound - min_bound
-    sphere_diameter_phys_units = float(min(partSize))
-
-    # smallest voxel size
+    mesh = trimesh.load_mesh(stl_filename, process=False)
+    assert not mesh.is_empty, ValueError("Loaded mesh is empty or invalid.")
+    mesh_vertices = mesh.vertices
+
+    # Find voxel size and sphere radius
+    min_bound = mesh_vertices.min(axis=0)
+    max_bound = mesh_vertices.max(axis=0)
+    mesh_extents = max_bound - min_bound
+    sphere_diameter_phys_units = float(min(mesh_extents))
     voxel_size = sphere_diameter_phys_units / num_finest_voxels_across_part
     sphere_radius = sphere_diameter_phys_units / voxel_size / 2.0
 
-    # Compute translation to put mesh into first octant of that domain—
+    # Compute translation to put mesh into first octant of that domain
     shift = np.array(sphere_origin) * voxel_size - sphere_diameter_phys_units / 2.0 - min_bound
 
     # Apply translation and save out temp stl
-    mesh.translate(shift)
-    mesh.compute_vertex_normals()
-    mesh_vertices = np.asarray(mesh.vertices) / voxel_size
-    o3d.io.write_triangle_mesh("temp.stl", mesh)
-    os.remove("temp.stl")
+    mesh_vertices = mesh_vertices + shift
+    mesh_vertices = np.asarray(mesh_vertices) / voxel_size
 
-    # Mesh base don temp stl
+    # Mesh based on temp stl
     # Create the multires grid
     num_levels = 3
     level_origins = []
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index c656f61c..a3ec50f1 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -1,5 +1,5 @@
 import numpy as np
-import open3d as o3d
+import trimesh
 from typing import Any
 
 import neon
@@ -23,7 +23,7 @@ def adjust_bbox(cuboid_max, cuboid_min, voxel_size_coarsest):
     return adjusted_min, adjusted_max
 
 
-def make_cuboid_mesh(voxel_size, cuboids, stl_name):
+def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
     """
     Create a multi-level cuboid mesh with bounding boxes aligned to the level 0 grid.
     Voxel matrices are set to ones only in regions not covered by finer levels.
@@ -37,13 +37,12 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_name):
         list: Level data with voxel matrices, voxel sizes, origins, and levels.
     """
     # Load the mesh and get its bounding box
-    mesh = o3d.io.read_triangle_mesh(stl_name)
-    if mesh.is_empty():
-        raise ValueError("Loaded mesh is empty or invalid.")
+    mesh = trimesh.load_mesh(stl_filename, process=False)
+    assert not mesh.is_empty, ValueError("Loaded mesh is empty or invalid.")
 
-    aabb = mesh.get_axis_aligned_bounding_box()
-    min_bound = aabb.get_min_bound()
-    max_bound = aabb.get_max_bound()
+    mesh_vertices = mesh.vertices
+    min_bound = mesh_vertices.min(axis=0)
+    max_bound = mesh_vertices.max(axis=0)
     partSize = max_bound - min_bound
 
     level_data = []

From 4cfad42136b3e7ad865b82b919e9d5822ece2f63 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 11 Jul 2025 22:33:21 -0400
Subject: [PATCH 112/208] fixing CPU memory issue: merging duplicates in chunks

---
 xlb/utils/mesher.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index a3ec50f1..b8e8774f 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -334,12 +334,31 @@ def write_hdf5_file(self, filename, coordinates, connectivity, level_id_field, f
     def _merge_duplicates(self, coordinates, connectivity):
         # Merging duplicate points
         tolerance = 0.01
-        grid_coords = np.round(coordinates / tolerance).astype(np.int64)
-        hash_keys = grid_coords[:, 0] + grid_coords[:, 1] * 1_000_000 + grid_coords[:, 2] * 1_000_000_000_000
-
-        _, unique_indices, inverse = np.unique(hash_keys, return_index=True, return_inverse=True)
-        coordinates = coordinates[unique_indices]
-        connectivity = inverse[connectivity]
+        chunk_size = 10_000_000  # Adjust based on GPU memory
+        num_points = coordinates.shape[0]
+        unique_points = []
+        mapping = np.zeros(num_points, dtype=np.int32)
+        unique_idx = 0
+
+        for start in range(0, num_points, chunk_size):
+            end = min(start + chunk_size, num_points)
+            coords_chunk = coordinates[start:end]
+
+            # Simple hashing: grid coordinates as tuple keys
+            grid_coords = np.round(coords_chunk / tolerance).astype(np.int64)
+            hash_keys = (grid_coords[:, 0] +
+                         grid_coords[:, 1] * 1_000_000 +
+                         grid_coords[:, 2] * 1_000_000_000_000)
+            unique_hash, inverse = np.unique(hash_keys, return_inverse=True)
+            unique_hash, unique_indices, inverse = np.unique(hash_keys, return_index=True, return_inverse=True)
+            unique_chunk = coords_chunk[unique_indices]
+
+            unique_points.append(unique_chunk)
+            mapping[start:end] = inverse + unique_idx
+            unique_idx += len(unique_hash)
+
+        coordinates = np.concatenate(unique_points)
+        connectivity = mapping[connectivity]
         return coordinates, connectivity
 
     def _transform_coordinates(self, coordinates, scale, offset):

From 54750d7d87ac9e9ee70155cadc6bfd6ce7af6ffc Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 14 Jul 2025 15:11:22 -0400
Subject: [PATCH 113/208] Generalized outlet initializer definition using
 bc_mask and made generic functional to handle both warp and neon

---
 examples/cfd/rotating_sphere_3d.py  |  4 +-
 xlb/helper/initializers.py          | 88 +++++++++++++++++++++--------
 xlb/operator/stepper/nse_stepper.py | 45 +++------------
 xlb/utils/mesher.py                 |  4 +-
 4 files changed, 78 insertions(+), 63 deletions(-)

diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index 00415ef5..008ab71e 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -148,8 +148,8 @@ def bc_profile_warp(index: wp.vec3i, time: Any):
 from xlb.helper.initializers import OutletInitializer
 
 initializer = OutletInitializer(
-    wind_speed=wind_speed,
-    grid_shape=grid_shape,
+    outlet_bc_id=bc_do_nothing.id,
+    wind_vector=(wind_speed, 0.0, 0.0),
     velocity_set=velocity_set,
     precision_policy=precision_policy,
     compute_backend=compute_backend,
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index d6e3343c..efaea33d 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -5,6 +5,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator.equilibrium import MultiresQuadraticEquilibrium
+import neon
 
 
 def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=None, u=None):
@@ -38,53 +39,96 @@ def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho
 class OutletInitializer(Operator):
     def __init__(
         self,
-        wind_speed=None,
-        grid_shape=None,
+        outlet_bc_id: int = None,
+        wind_vector=None,
         velocity_set: VelocitySet = None,
         precision_policy=None,
         compute_backend=None,
     ):
-        self.wind_speed = wind_speed
+        assert outlet_bc_id is not None, "Outlet BC ID must be provided."
+        self.outlet_bc_id = outlet_bc_id
+        self.wind_vector = wind_vector
         self.rho = 1.0
-        self.grid_shape = grid_shape
-        self.equilibrium = QuadraticEquilibrium(velocity_set=velocity_set, precision_policy=precision_policy, compute_backend=compute_backend)
+        self.equilibrium = QuadraticEquilibrium(
+            velocity_set=velocity_set,
+            precision_policy=precision_policy,
+            compute_backend=ComputeBackend.WARP,
+        )
         super().__init__(velocity_set, precision_policy, compute_backend)
 
     def _construct_warp(self):
-        nx, ny, nz = self.grid_shape
         _q = self.velocity_set.q
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+        _u = _u_vec(self.wind_vector[0], self.wind_vector[1], self.wind_vector[2])
         _rho = self.compute_dtype(self.rho)
-        _u = _u_vec(self.wind_speed, 0.0, 0.0)
         _w = self.velocity_set.w
+        outlet_bc_id = self.outlet_bc_id
+
+        @wp.func
+        def functional(index: Any, bc_mask: Any, f_field: Any):
+            # Check if the index corresponds to the outlet
+            if self.read_field(bc_mask, index, 0) == outlet_bc_id:
+                _feq = self.equilibrium.warp_functional(_rho, _u)
+                for l in range(_q):
+                    self.write_field(f_field, index, l, _feq[l])
+            else:
+                # In the rest of the domain, we assume zero velocity and equilibrium distribution.
+                for l in range(_q):
+                    self.write_field(f_field, index, l, _w[l])
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel(f: wp.array4d(dtype=Any)):
+        def kernel(
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            f_field: wp.array4d(dtype=Any),
+        ):
             # Get the global index
             i, j, k = wp.tid()
             index = wp.vec3i(i, j, k)
 
             # Set the velocity at the outlet (i.e. where i = nx-1)
-            if index[0] == nx - 1:
-                _feq = self.equilibrium.warp_functional(_rho, _u)
-                for l in range(_q):
-                    f[l, index[0], index[1], index[2]] = _feq[l]
-            else:
-                # In the rest of the domain, we assume zero velocity and equilibrium distribution.
-                for l in range(_q):
-                    f[l, index[0], index[1], index[2]] = _w[l]
+            functional(index, bc_mask, f_field)
 
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f):
+    def warp_implementation(self, bc_mask, f_field):
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
-            inputs=[
-                f,
-            ],
-            dim=f.shape[1:],
+            inputs=[bc_mask, f_field],
+            dim=f_field.shape[1:],
         )
-        return f
+        return f_field
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="OutletInitializer")
+        def container(
+            bc_mask: Any,
+            f_field: Any,
+        ):
+            def launcher(loader: neon.Loader):
+                loader.set_grid(f_field.get_grid())
+                f_field_pn = loader.get_write_handle(f_field)
+                bc_mask_pn = loader.get_read_handle(bc_mask)
+
+                @wp.func
+                def kernel(index: Any):
+                    # apply the functional
+                    functional(index, bc_mask_pn, f_field_pn)
+
+                loader.declare_kernel(kernel)
+
+            return launcher
+
+        return _, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, bc_mask, f_field, stream=0):
+        # Launch the neon container
+        c = self.neon_container(bc_mask, f_field)
+        c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
+        return f_field
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index e804678e..3adb1d00 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -77,14 +77,6 @@ def prepare_fields(self, initializer=None):
             grid=self.grid, velocity_set=self.velocity_set, compute_backend=self.compute_backend, precision_policy=self.precision_policy
         )
 
-        # Initialize distribution functions if initializer is provided
-        if initializer is not None:
-            f_0 = initializer(f_0)
-        else:
-            from xlb.helper.initializers import initialize_eq
-
-            f_0 = initialize_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
-
         # Copy f_0 using backend-specific copy to f_1
         if self.compute_backend == ComputeBackend.JAX:
             f_1 = f_0.copy()
@@ -92,35 +84,8 @@ def prepare_fields(self, initializer=None):
             wp.copy(f_1, f_0)
         if self.compute_backend == ComputeBackend.NEON:
             f_1.copy_from_run(f_0, 0)
-        if True:
-            import xlb.velocity_set
-            from xlb.operator.macroscopic import Macroscopic
-
-            # macro = Macroscopic(
-            #     compute_backend=ComputeBackend.NEON,
-            #     precision_policy=self.precision_policy,
-            #     velocity_set=xlb.velocity_set.D3Q19(precision_policy=self.precision_policy, backend=ComputeBackend.NEON),
-            # )
-            rho = self.grid.create_field(1, dtype=self.precision_policy.store_precision)
-            u = self.grid.create_field(3, dtype=self.precision_policy.store_precision)
-            # rho, u = macro(f_0, rho, u)
-            # wp.synchronize()
-            # wp.synchronize()
-            # u.update_host(0)
-            # rho.update_host(0)
-            # wp.synchronize()
-            # u.export_vti("u_init.vti", 'u')
-            # rho.export_vti("rho_init.vti", 'rho')
-            # rho, u = macro(f_1, rho, u)
-            # wp.synchronize()
-            # wp.synchronize()
-            # u.update_host(0)
-            # rho.update_host(0)
-            # wp.synchronize()
-            # u.export_vti("u_f1_init.vti", 'u')
-            # rho.export_vti("rho_f1_init.vti", 'rho')
-        # Important note: XLB uses f_1 buffer (center index and missing directions) to store auxiliary data for boundary conditions.
 
+        # Important note: XLB uses f_1 buffer (center index and missing directions) to store auxiliary data for boundary conditions.
         # Process boundary conditions and update masks
         f_1, bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, f_1, bc_mask, missing_mask)
 
@@ -132,6 +97,14 @@ def prepare_fields(self, initializer=None):
         # bc_mask.export_vti("bc_mask.vti", 'bc_mask')
         # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
+        # Initialize distribution functions if initializer is provided
+        if initializer is not None:
+            f_0 = initializer(bc_mask, f_0)
+        else:
+            from xlb.helper.initializers import initialize_eq
+
+            f_0 = initialize_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend)
+
         return f_0, f_1, bc_mask, missing_mask
 
     @classmethod
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index b8e8774f..ff33983b 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -346,9 +346,7 @@ def _merge_duplicates(self, coordinates, connectivity):
 
             # Simple hashing: grid coordinates as tuple keys
             grid_coords = np.round(coords_chunk / tolerance).astype(np.int64)
-            hash_keys = (grid_coords[:, 0] +
-                         grid_coords[:, 1] * 1_000_000 +
-                         grid_coords[:, 2] * 1_000_000_000_000)
+            hash_keys = grid_coords[:, 0] + grid_coords[:, 1] * 1_000_000 + grid_coords[:, 2] * 1_000_000_000_000
             unique_hash, inverse = np.unique(hash_keys, return_inverse=True)
             unique_hash, unique_indices, inverse = np.unique(hash_keys, return_index=True, return_inverse=True)
             unique_chunk = coords_chunk[unique_indices]

From 554835d4c1cc509ca4a61c850df0a2ef38eb61ca Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 14 Jul 2025 16:24:14 -0400
Subject: [PATCH 114/208] Added multires outlet initializer

---
 .../cuboid_flow_past_sphere_3d.py             | 12 +++++
 xlb/helper/initializers.py                    | 50 ++++++++++++++++++-
 xlb/helper/simulation_manager.py              |  4 +-
 xlb/operator/stepper/nse_multires_stepper.py  | 13 +++--
 4 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 9ddd56f6..827534c9 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -216,12 +216,24 @@ def bc_profile_warp(index: wp.vec3i):
 visc = u_max * num_finest_voxels_across_part / Re
 omega = 1.0 / (3.0 * visc + 0.5)
 
+# Make initializer operator
+from xlb.helper.initializers import MultiresOutletInitializer
+
+initializer = MultiresOutletInitializer(
+    outlet_bc_id=bc_outlet.id,
+    wind_vector=(u_max, 0.0, 0.0),
+    velocity_set=velocity_set,
+    precision_policy=precision_policy,
+    compute_backend=compute_backend,
+)
+
 # Define a multi-resolution simulation manager
 sim = xlb.helper.MultiresSimulationManager(
     omega=omega,
     grid=grid,
     boundary_conditions=boundary_conditions,
     collision_type="KBC",
+    initializer=initializer,
 )
 
 # Setup Momentum Transfer for Force Calculation
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index efaea33d..20c7b779 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -89,7 +89,7 @@ def kernel(
             # Set the velocity at the outlet (i.e. where i = nx-1)
             functional(index, bc_mask, f_field)
 
-        return None, kernel
+        return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bc_mask, f_field):
@@ -132,3 +132,51 @@ def neon_implementation(self, bc_mask, f_field, stream=0):
         c = self.neon_container(bc_mask, f_field)
         c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
         return f_field
+
+
+# Defining an initializer for outlet only
+class MultiresOutletInitializer(OutletInitializer):
+    def __init__(
+        self,
+        outlet_bc_id: int = None,
+        wind_vector=None,
+        velocity_set: VelocitySet = None,
+        precision_policy=None,
+        compute_backend=None,
+    ):
+        super().__init__(outlet_bc_id, wind_vector, velocity_set, precision_policy, compute_backend)
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="MultiresOutletInitializer")
+        def container(
+            bc_mask: Any,
+            f_field: Any,
+            level: Any,
+        ):
+            def launcher(loader: neon.Loader):
+                loader.set_mres_grid(f_field.get_grid(), level)
+                f_field_pn = loader.get_mres_write_handle(f_field)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask)
+
+                @wp.func
+                def kernel(index: Any):
+                    # apply the functional
+                    functional(index, bc_mask_pn, f_field_pn)
+
+                loader.declare_kernel(kernel)
+
+            return launcher
+
+        return _, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, bc_mask, f_field, stream=0):
+        grid = bc_mask.get_grid()
+        for level in range(grid.num_levels):
+            # Launch the neon container
+            c = self.neon_container(bc_mask, f_field, level)
+            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
+        return f_field
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index b639e58a..f6c7766b 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -17,9 +17,11 @@ def __init__(
         collision_type="BGK",
         forcing_scheme="exact_difference",
         force_vector=None,
+        initializer=None,
     ):
         super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
+        self.initializer = initializer
         self.omega = omega
         self.count_levels = grid.count_levels
         # Create fields
@@ -38,7 +40,7 @@ def __init__(
         # self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
 
         # Prepare fields
-        self.f_0, self.f_1, self.bc_mask, self.missing_mask = self.prepare_fields(self.rho, self.u)
+        self.f_0, self.f_1, self.bc_mask, self.missing_mask = self.prepare_fields(self.rho, self.u, self.initializer)
         self.prepare_coalescence_count(coalescence_factor=self.coalescence_factor, bc_mask=self.bc_mask)
 
         # wp.synchronize()
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 56a864a5..44478199 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -67,10 +67,6 @@ def prepare_fields(self, rho, u, initializer=None):
         missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
         bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
-        from xlb.helper.initializers import initialize_multires_eq
-
-        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
-
         for level in range(self.grid.count_levels):
             f_1.copy_from_run(level, f_0, 0)
         # f_0.update_host(0)
@@ -89,6 +85,15 @@ def prepare_fields(self, rho, u, initializer=None):
         # f_0.export_vti("init_f0.vti", 'init_f0')
         # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
+        # Initialize distribution functions if initializer is provided
+        if initializer is not None:
+            # Refer to xlb.helper.initializers for available initializers
+            f_0 = initializer(bc_mask, f_0)
+        else:
+            from xlb.helper.initializers import initialize_multires_eq
+
+            f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
+
         return f_0, f_1, bc_mask, missing_mask
 
     def prepare_coalescence_count(self, coalescence_factor, bc_mask):

From 0825568f7a82a595d486ed75edb1829b06c2e2de Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 15 Jul 2025 12:35:04 -0400
Subject: [PATCH 115/208] Addressed PR review

---
 xlb/utils/mesher.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index ff33983b..54bf0515 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -154,7 +154,7 @@ def __init__(self, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
         assert coordinates.size != 0, "Error: No valid data to process. Check the input levels_data."
 
         # Merge duplicate points
-        coordinates, connectivity = self._merge_duplicates(coordinates, connectivity)
+        coordinates, connectivity = self._merge_duplicates(coordinates, connectivity, levels_data)
 
         # Apply scale and offset
         coordinates = self._transform_coordinates(coordinates, scale, offset)
@@ -331,7 +331,7 @@ def write_hdf5_file(self, filename, coordinates, connectivity, level_id_field, f
             for fname, fdata in field_data.items():
                 fg.create_dataset(fname, data=fdata.astype(np.float32), compression=compression, compression_opts=compression_opts)
 
-    def _merge_duplicates(self, coordinates, connectivity):
+    def _merge_duplicates(self, coordinates, connectivity, levels_data):
         # Merging duplicate points
         tolerance = 0.01
         chunk_size = 10_000_000  # Adjust based on GPU memory
@@ -340,13 +340,17 @@ def _merge_duplicates(self, coordinates, connectivity):
         mapping = np.zeros(num_points, dtype=np.int32)
         unique_idx = 0
 
+        # Get the grid shape of computational box at the finest level from the levels_data
+        num_levels = len(levels_data)
+        grid_shape_finest = np.array(levels_data[-1][0].shape) * 2 ** (num_levels - 1)
+
         for start in range(0, num_points, chunk_size):
             end = min(start + chunk_size, num_points)
             coords_chunk = coordinates[start:end]
 
             # Simple hashing: grid coordinates as tuple keys
             grid_coords = np.round(coords_chunk / tolerance).astype(np.int64)
-            hash_keys = grid_coords[:, 0] + grid_coords[:, 1] * 1_000_000 + grid_coords[:, 2] * 1_000_000_000_000
+            hash_keys = grid_coords[:, 0] + grid_coords[:, 1] * grid_shape_finest[0] + grid_coords[:, 2] * grid_shape_finest[0] * grid_shape_finest[1]
             unique_hash, inverse = np.unique(hash_keys, return_inverse=True)
             unique_hash, unique_indices, inverse = np.unique(hash_keys, return_index=True, return_inverse=True)
             unique_chunk = coords_chunk[unique_indices]

From 266d0b73fa4e5578933d75ad61762a54517841b6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 16 Jul 2025 11:13:55 -0400
Subject: [PATCH 116/208] Generalized the MRES IO class and added a new method
 to export a 2D image at an arbitrary slice

---
 .../cuboid_flow_past_sphere_3d.py             |  18 +-
 xlb/utils/__init__.py                         |   2 +-
 xlb/utils/mesher.py                           | 156 ++++++++++++++++--
 3 files changed, 157 insertions(+), 19 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 827534c9..24d1a42b 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -135,10 +135,10 @@ def prepare_sparsity_pattern(level_data):
 
 
 # Define exporter object for hdf5 output
-from xlb.utils import ExportMultiresHDF5
+from xlb.utils import MultiresIO
 
 # Define an exporter for the multiresolution data
-h5exporter = ExportMultiresHDF5(level_data)
+exporter = MultiresIO(level_data)
 
 # Prepare the sparsity pattern and origins from the level data
 sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
@@ -271,8 +271,20 @@ def print_lift_drag(sim):
         sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
 
         # Call the exporter to save the current state
+        nx, ny, nz = grid_shape_finest
         filename = f"multires_flow_over_sphere_3d_{step:04d}"
-        h5exporter(filename, sim.u, sim.rho, compression="gzip", compression_opts=2)
+        exporter.to_hdf5(filename, sim.u, sim.rho, compression="gzip", compression_opts=2)
+        exporter.to_slice_image(
+            "velocity_x",
+            sim.u,
+            sim.rho,
+            plane_point=(nx // 2, ny // 2, nz // 2),
+            plane_normal=(0, 0, 1),
+            slice_thickness=1.0,
+            output_filename=f"{filename}_slice_x.png",
+            grid_res=256,
+            bounds=(0.4, 0.6, 0.4, 0.6),
+        )
 
         # Print lift and drag coefficients
         print_lift_drag(sim)
diff --git a/xlb/utils/__init__.py b/xlb/utils/__init__.py
index 51c97c4c..7af8f80c 100644
--- a/xlb/utils/__init__.py
+++ b/xlb/utils/__init__.py
@@ -7,4 +7,4 @@
     voxelize_stl,
     axangle2mat,
 )
-from .mesher import make_cuboid_mesh, ExportMultiresHDF5
+from .mesher import make_cuboid_mesh, MultiresIO
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 54bf0515..0059632d 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -125,27 +125,19 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
     return list(reversed(level_data))
 
 
-class ExportMultiresHDF5(object):
+class MultiresIO(object):
     def __init__(self, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
         """
-        Initialize the ExportMultiresHDF5 object.
+        Initialize the MultiresIO object.
 
         Parameters
         ----------
         levels_data : list of tuples
             Each tuple contains (data, voxel_size, origin, level).
-        filename : str
-            The name of the output HDF5 file.
-        fields : dict, optional
-            A dictionary of fields to be included in the HDF5 file.
         scale : float or tuple, optional
             Scale factor for the coordinates.
         offset : tuple, optional
             Offset to be applied to the coordinates.
-        compression : str, optional
-            Compression method for the HDF5 datasets.
-        compression_opts : int, optional
-            Compression options for the HDF5 datasets.
         """
         # Process the multires geometry and extract coordinates and connectivity in the coordinate system of the finest level
         coordinates, connectivity, level_id_field, total_cells = self.process_geometry(levels_data, scale)
@@ -296,7 +288,7 @@ def save_xdmf(self, h5_filename, xmf_filename, total_cells, num_points, fields={
         print("\tXDMF file written successfully")
         return
 
-    def write_hdf5_file(self, filename, coordinates, connectivity, level_id_field, field_data, compression="gzip", compression_opts=0):
+    def save_hdf5_file(self, filename, coordinates, connectivity, level_id_field, field_data, compression="gzip", compression_opts=0):
         """Write the processed mesh data to an HDF5 file.
         Parameters
         ----------
@@ -438,9 +430,10 @@ def kernel(index: Any):
 
         return container
 
-    def __call__(self, filename, velocity_neon, density_neon, compression="gzip", compression_opts=0, store_precision=None):
-        import time
-
+    def get_fields_data(self, velocity_neon, density_neon):
+        """
+        Extracts and prepares the fields data from the NEON fields for export.
+        """
         # Ensure that this operator is called on multires grids
         grid_mres = velocity_neon.get_grid()
         assert grid_mres.get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
@@ -478,12 +471,145 @@ def __call__(self, filename, velocity_neon, density_neon, compression="gzip", co
             fields_data[field_name] = np.concatenate(fields_data[field_name])
             assert fields_data[field_name].size == self.total_cells, f"Error: Field {field_name} size mismatch!"
 
+        return fields_data
+
+    def to_hdf5(self, filename, velocity_neon, density_neon, compression="gzip", compression_opts=0, store_precision=None):
+        """
+        Export the multi-resolution mesh data to an HDF5 file.
+        Parameters
+        ----------
+        filename : str
+            The name of the output HDF5 file (without extension).
+        velocity_neon : neon mGrid Field
+            The NEON field containing velocity data.
+        density_neon : neon mGrid Field
+            The NEON field containing density data.
+        compression : str, optional
+            The compression method to use for the HDF5 file.
+        compression_opts : int, optional
+            The compression options to use for the HDF5 file.
+        store_precision : str, optional
+            The precision policy for storing data in the HDF5 file.
+        """
+        import time
+
+        # Get the fields data from the NEON fields
+        fields_data = self.get_fields_data(velocity_neon, density_neon)
+
         # Save XDMF file
         self.save_xdmf(filename + ".h5", filename + ".xmf", self.total_cells, len(self.coordinates), fields_data)
 
         # Writing HDF5 file
         print("\tWriting HDF5 file")
         tic_write = time.perf_counter()
-        self.write_hdf5_file(filename, self.coordinates, self.connectivity, self.level_id_field, fields_data, compression, compression_opts)
+        self.save_hdf5_file(filename, self.coordinates, self.connectivity, self.level_id_field, fields_data, compression, compression_opts)
         toc_write = time.perf_counter()
         print(f"\tHDF5 file written in {toc_write - tic_write:0.1f} seconds")
+
+    def to_slice_image(
+        self,
+        field_name,
+        velocity_neon,
+        density_neon,
+        plane_point,
+        plane_normal,
+        slice_thickness,
+        output_filename,
+        bounds=[0, 1, 0, 1],
+        grid_res=512,
+        cmap=None,
+    ):
+        """
+        Export an arbitrary-plane slice from unstructured point data to PNG.
+
+        Parameters
+        ----------
+        field_name : str
+            The field to plot.
+        plane_point : array_like
+            A point [x, y, z] on the plane.
+        plane_normal : array_like
+            Plane normal vector [nx, ny, nz].
+        slice_thickness : float
+            How thick (in units of the coordinate system) the slice should be.
+        output_filename : str
+            Output PNG filename (without extension).
+        grid_resolution : tuple
+            Resolution of output image (pixels in plane u, v directions).
+        grid_size : tuple
+            Physical size of slice grid (width, height).
+        cmap : str
+            Matplotlib colormap.
+        """
+        from matplotlib import cm
+        import numpy as np
+        import matplotlib.pyplot as plt
+        from scipy.interpolate import griddata
+
+        # Get the fields data from the NEON fields
+        fields_data = self.get_fields_data(velocity_neon, density_neon)
+        cell_values = fields_data[field_name]
+
+        # get the normalized plane normal
+        plane_normal = np.asarray(plane_normal)
+        n = plane_normal / np.linalg.norm(plane_normal)
+
+        # Compute centroids (K = 8 for hexahedral cells)
+        cell_points = self.coordinates[self.connectivity]  # shape (M, K, 3)
+        centroids = np.mean(cell_points, axis=1)  # (M, 3)
+
+        # Compute signed distances of each cell center to the plane
+        plane_point *= plane_normal
+        sdf = np.dot(centroids - plane_point, n)
+
+        # Filter: cells with centroid near plane
+        mask = np.abs(sdf) <= slice_thickness / 2
+        if not np.any(mask):
+            raise ValueError("No cells intersect the plane within thickness.")
+
+        # Project centroids to plane
+        centroids_slice = centroids[mask]
+        sdf_slice = sdf[mask]
+        proj = centroids_slice - np.outer(sdf_slice, n)
+
+        values = cell_values[mask]
+
+        # Build in-plane basis
+        if np.allclose(n, [1, 0, 0]):
+            u1 = np.array([0, 1, 0])
+        else:
+            u1 = np.array([1, 0, 0])
+        u2 = np.cross(n, u1)
+
+        local_x = np.dot(proj - plane_point, u1)
+        local_y = np.dot(proj - plane_point, u2)
+
+        # Define extent of the plot
+        xmin, xmax, ymin, ymax = local_x.min(), local_x.max(), local_y.min(), local_y.max()
+        Lx = xmax - xmin
+        Ly = ymax - ymin
+        extent = np.array([xmin + bounds[0] * Lx, xmin + bounds[1] * Lx, ymin + bounds[2] * Ly, ymin + bounds[3] * Ly])
+        mask_bounds = (extent[0] <= local_x) & (local_x <= extent[1]) & (extent[2] <= local_y) & (local_y <= extent[3])
+
+        if cmap is None:
+            cmap = cm.nipy_spectral
+
+        # Rasterize: scatter cell centers to 2D grid
+        grid_x = np.linspace(local_x[mask_bounds].min(), local_x[mask_bounds].max(), grid_res)
+        grid_y = np.linspace(local_y[mask_bounds].min(), local_y[mask_bounds].max(), grid_res)
+        xv, yv = np.meshgrid(grid_x, grid_y, indexing="xy")
+
+        # Linear interpolation for each grid point
+        grid_field = griddata(points=(local_x, local_y), values=values, xi=(xv, yv), method="linear", fill_value=np.nan)
+
+        # Plot
+        plt.imshow(
+            grid_field,
+            extent=[xmin, xmax, ymin, ymax],
+            cmap=cmap,
+            origin="lower",
+            aspect="equal",
+        )
+        plt.colorbar(label=field_name)
+        plt.savefig(output_filename + ".png", dpi=300, bbox_inches="tight")
+        plt.close()

From 70c5ba54364789eab1a16d1e97228a7a8f272f88 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 16 Jul 2025 17:06:26 -0400
Subject: [PATCH 117/208] removed the redundant stl sphere example and
 corrected the sphere indices example.

---
 .../cuboid_flow_past_sphere_3d.py             |   1 +
 .../grid_refinement/flow_past_sphere_3d.py    |  42 +++-
 .../stl_flow_past_sphere_3d.py                | 225 ------------------
 3 files changed, 35 insertions(+), 233 deletions(-)
 delete mode 100644 examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 24d1a42b..44abbe47 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -268,6 +268,7 @@ def print_lift_drag(sim):
         # print(f"\tVTK file written in {toc_write - tic_write:0.1f} seconds")
 
         # Call the Macroscopic operator to compute macroscopic fields
+        wp.synchronize()
         sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
 
         # Call the exporter to save the current state
diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index 1d76c5fa..cfe5fe08 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -35,23 +35,48 @@
 level_list = []
 for lvl in range(num_levels):
     divider = 2**lvl
-    growth = 1.5**lvl
-    shape = grid_shape[0] // divider, grid_shape[1] // divider, grid_shape[2] // divider
+    growth = 1.25**lvl
+    shape = nx // divider, ny // divider, nz // divider
     if lvl == num_levels - 1:
         level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
-        box_origin = (0, 0, 0)  # The coarsest level has no origin offset
+        box_origin = np.array([0, 0, 0])  # The coarsest level has no origin offset
     else:
-        box_size = tuple([int(shape[i] // 4 * growth) for i in range(3)])
-        box_origin = tuple([sphere_origin[0] // divider - 4 * sphere_radius // divider] + [shape[i] // 2 - box_size[i] // 2 for i in range(1, 3)])
+        box_size = tuple([int(0.3 * shape[i] * growth) for i in range(3)])
         level = np.ascontiguousarray(np.ones(box_size, dtype=int), dtype=np.int32)
+        if lvl == 0:
+            box_origin = tuple(
+                [sphere_origin[0] // divider - int(2 * growth * sphere_radius // divider)] + [shape[i] // 2 - box_size[i] // 2 for i in range(1, 3)]
+            )
+        else:
+            finer_box_size = level_list[-1].shape
+            finer_box_origin = np.array(level_origins[-1])
+            shift = np.array(box_size) - np.array(finer_box_size) // 2
+            box_origin = finer_box_origin // 2 - shift // 2
     level_list.append(level)
-    level_origins.append(neon.Index_3d(*box_origin))
+    level_origins.append(box_origin)
 
+
+# Note that this exporter does not produce expected results at the moment because the level_list
+# produced above include dense fields and are not sparse.
+
+# # Define exporter object for hdf5 output
+# from xlb.utils import MultiresIO
+
+# # Pack the needed information for the exporter in a list called "level_data"
+# level_data = []
+# for level in range(num_levels):
+#     voxel_size = 2**level
+#     level_data.append(
+#         [level_list[level].astype(bool), voxel_size, level_origins[level], level],
+#     )
+# exporter = MultiresIO(level_data)
+
+# Create the multires grid
 grid = multires_grid_factory(
     grid_shape,
     velocity_set=velocity_set,
     sparsity_pattern_list=level_list,
-    sparsity_pattern_origins=level_origins,
+    sparsity_pattern_origins=[neon.Index_3d(*origin) for origin in level_origins],
 )
 
 # Define Boundary Indices
@@ -138,7 +163,8 @@ def bc_profile_warp(index: wp.vec3i):
     sim.step()
 
     if step % post_process_interval == 0 or step == num_steps - 1:
-        # TODO: Issues in the vtk output for rectangular cuboids (as if a duboid grid with the largest side is assumed)
+        # TODO: Issues in the vtk output for rectangular cuboids (as if a cuboid grid with the largest side is assumed)
+        wp.synchronize()
         sim.export_macroscopic("multires_flow_over_sphere_3d_")
         wp.synchronize()
         end_time = time.time()
diff --git a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
deleted file mode 100644
index a840ae23..00000000
--- a/examples/cfd/grid_refinement/stl_flow_past_sphere_3d.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import neon
-import warp as wp
-import numpy as np
-import time
-
-import xlb
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import multires_grid_factory
-from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
-from xlb.operator.boundary_condition import (
-    FullwayBounceBackBC,
-    HalfwayBounceBackBC,
-    RegularizedBC,
-    ExtrapolationOutflowBC,
-    DoNothingBC,
-    ZouHeBC,
-    HybridBC,
-)
-from xlb.utils import make_cuboid_mesh
-from xlb.operator.boundary_masker import MeshVoxelizationMethod
-from xlb.operator.force import MultiresMomentumTransfer
-
-
-def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape):
-    """
-    Generate a cuboid mesh based on the provided voxel size and domain multipliers.
-    """
-    import trimesh
-
-    # STL position
-    nx, ny, nz = grid_shape
-    sphere_origin = (nx // 6, ny // 2, nz // 2)
-
-    # Load the mesh
-    mesh = trimesh.load_mesh(stl_filename, process=False)
-    assert not mesh.is_empty, ValueError("Loaded mesh is empty or invalid.")
-    mesh_vertices = mesh.vertices
-
-    # Find voxel size and sphere radius
-    min_bound = mesh_vertices.min(axis=0)
-    max_bound = mesh_vertices.max(axis=0)
-    mesh_extents = max_bound - min_bound
-    sphere_diameter_phys_units = float(min(mesh_extents))
-    voxel_size = sphere_diameter_phys_units / num_finest_voxels_across_part
-    sphere_radius = sphere_diameter_phys_units / voxel_size / 2.0
-
-    # Compute translation to put mesh into first octant of that domain
-    shift = np.array(sphere_origin) * voxel_size - sphere_diameter_phys_units / 2.0 - min_bound
-
-    # Apply translation and save out temp stl
-    mesh_vertices = mesh_vertices + shift
-    mesh_vertices = np.asarray(mesh_vertices) / voxel_size
-
-    # Mesh based on temp stl
-    # Create the multires grid
-    num_levels = 3
-    level_origins = []
-    level_data = []
-    for lvl in range(num_levels):
-        divider = 2**lvl
-        growth = 1.25**lvl
-        shape = nx // divider, ny // divider, nz // divider
-        if lvl == num_levels - 1:
-            level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
-            box_origin = (0, 0, 0)  # The coarsest level has no origin offset
-        else:
-            box_size = tuple([int(0.3 * shape[i] * growth) for i in range(3)])
-            if lvl == 0:
-                box_origin = tuple(
-                    [sphere_origin[0] // divider - int(2 * growth * sphere_radius // divider)]
-                    + [shape[i] // 2 - box_size[i] // 2 for i in range(1, 3)]
-                )
-            else:
-                finer_box_size = level_data[-1].shape
-                finer_box_origin = np.array(level_origins[-1])
-                shift = np.array(box_size) - np.array(finer_box_size) // 2
-                box_origin = finer_box_origin // 2 - shift // 2
-            level = np.ascontiguousarray(np.ones(box_size, dtype=int), dtype=np.int32)
-        level_data.append(level)
-        level_origins.append(box_origin)
-
-    return level_data, level_origins, mesh_vertices, sphere_radius
-
-
-# -------------------------- Simulation Setup --------------------------
-
-# The following parameters define the resolution of the voxelized grid
-num_finest_voxels_across_part = 10
-
-# Other setup parameters
-Re = 500.0
-grid_shape = (512 // 2, 128 // 2, 128 // 2)
-compute_backend = ComputeBackend.NEON
-precision_policy = PrecisionPolicy.FP32FP32
-velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
-u_max = 0.04
-num_steps = 10000
-post_process_interval = 1000
-
-# Initialize XLB
-xlb.init(
-    velocity_set=velocity_set,
-    default_backend=compute_backend,
-    default_precision_policy=precision_policy,
-)
-
-# Generate the cuboid mesh and sphere vertices
-stl_filename = "examples/cfd/stl-files/sphere.stl"
-level_data, level_origins, sphere, sphere_radius = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part, grid_shape)
-
-# get the number of levels
-num_levels = len(level_data)
-
-# Create the multires grid
-grid = multires_grid_factory(
-    grid_shape,
-    velocity_set=velocity_set,
-    sparsity_pattern_list=level_data,
-    sparsity_pattern_origins=[neon.Index_3d(*box_origin) for box_origin in level_origins],
-)
-
-# Define Boundary Indices
-coarsest_level = grid.count_levels - 1
-box = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level))
-box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level), remove_edges=True)
-inlet = box_no_edge["left"]
-outlet = box_no_edge["right"]
-walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
-walls = np.unique(np.array(walls), axis=-1).tolist()
-
-
-# Define Boundary Conditions
-def bc_profile():
-    assert compute_backend == ComputeBackend.NEON
-
-    # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
-    _, ny, nz = grid_shape
-    dtype = precision_policy.compute_precision.wp_dtype
-    H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
-    H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
-    two = dtype(2.0)
-    u_max_wp = dtype(u_max)
-
-    @wp.func
-    def bc_profile_warp(index: wp.vec3i):
-        # Poiseuille flow profile: parabolic velocity distribution
-        y = dtype(index[1])
-        z = dtype(index[2])
-
-        # Calculate normalized distance from center
-        y_center = y - (H_y / two)
-        z_center = z - (H_z / two)
-        r_squared = (two * y_center / H_y) ** two + (two * z_center / H_z) ** two
-
-        # Parabolic profile: u = u_max * (1 - r²)
-        return wp.vec(u_max_wp * wp.max(dtype(0.0), dtype(1.0) - r_squared), length=1)
-
-    return bc_profile_warp
-
-
-# Convert bc indices to a list of list (first entry corresponds to the finest level)
-inlet = [[] for _ in range(num_levels - 1)] + [inlet]
-outlet = [[] for _ in range(num_levels - 1)] + [outlet]
-walls = [[] for _ in range(num_levels - 1)] + [walls]
-
-# Initialize Boundary Conditions
-bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
-# Alternatively, use a prescribed velocity profile
-# bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
-bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
-# bc_outlet = ExtrapolationOutflowBC(indices=outlet)
-bc_outlet = DoNothingBC(indices=outlet)
-bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
-# bc_sphere = HybridBC(
-#     bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=False
-# )
-
-boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
-
-# Configure the simulation relaxation time
-visc = u_max * num_finest_voxels_across_part / Re
-omega = 1.0 / (3.0 * visc + 0.5)
-
-# Define a multi-resolution simulation manager
-sim = xlb.helper.MultiresSimulationManager(
-    omega=omega,
-    grid=grid,
-    boundary_conditions=boundary_conditions,
-    collision_type="KBC",
-)
-
-# Setup Momentum Transfer for Force Calculation
-bc_sphre = boundary_conditions[-1]
-momentum_transfer = MultiresMomentumTransfer(bc_sphere, compute_backend=compute_backend)
-
-
-def print_lift_drag(sim):
-    # Compute lift and drag
-    boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
-    drag = boundary_force[0]  # x-direction
-    lift = boundary_force[2]
-    sphere_cross_section = np.pi * sphere_radius**2
-    u_avg = 0.5 * u_max
-    cd = 2.0 * drag / (u_avg**2 * sphere_cross_section)
-    cl = 2.0 * lift / (u_avg**2 * sphere_cross_section)
-    print(f"CD={cd}, CL={cl}")
-
-
-# -------------------------- Simulation Loop --------------------------
-
-wp.synchronize()
-start_time = time.time()
-for step in range(num_steps):
-    sim.step()
-
-    if step % post_process_interval == 0 or step == num_steps - 1:
-        # TODO: Issues in the vtk output for rectangular cuboids (as if a duboid grid with the largest side is assumed)
-        sim.export_macroscopic("multires_flow_over_sphere_3d_")
-        print_lift_drag(sim)
-        wp.synchronize()
-        end_time = time.time()
-        elapsed = end_time - start_time
-        print(f"Completed step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")
-        start_time = time.time()

From 853aefff271ced2d7a0eb588861e89d69dc94a08 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 16 Jul 2025 17:07:00 -0400
Subject: [PATCH 118/208] Enabled auto chunking in the exporter

---
 xlb/utils/mesher.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 0059632d..e323c644 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -310,18 +310,18 @@ def save_hdf5_file(self, filename, coordinates, connectivity, level_id_field, fi
         import h5py
 
         with h5py.File(filename + ".h5", "w") as f:
-            f.create_dataset("/Mesh/Points", data=coordinates, compression=compression, compression_opts=compression_opts, chunks=(100000, 3))
+            f.create_dataset("/Mesh/Points", data=coordinates, compression=compression, compression_opts=compression_opts, chunks=True)
             f.create_dataset(
                 "/Mesh/Connectivity",
                 data=connectivity,
                 compression=compression,
                 compression_opts=compression_opts,
-                chunks=(30000, 8),
+                chunks=True,
             )
             f.create_dataset("/Mesh/Level", data=level_id_field, compression=compression, compression_opts=compression_opts)
             fg = f.create_group("/Fields")
             for fname, fdata in field_data.items():
-                fg.create_dataset(fname, data=fdata.astype(np.float32), compression=compression, compression_opts=compression_opts)
+                fg.create_dataset(fname, data=fdata.astype(np.float32), compression=compression, compression_opts=compression_opts, chunks=True)
 
     def _merge_duplicates(self, coordinates, connectivity, levels_data):
         # Merging duplicate points

From 215d9938a9f406d5b4523f548a5dbaea3b73b85a Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 16 Jul 2025 23:21:51 +0200
Subject: [PATCH 119/208] Fix: reducing memory usage on CPU for mres.

---
 xlb/grid/multires_grid.py                    |  2 ++
 xlb/operator/stepper/nse_multires_stepper.py | 13 ++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index b86fed2c..f265fd1e 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -74,11 +74,13 @@ def create_field(
         cardinality: int,
         dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
         fill_value=None,
+        neon_data_use: neon.DataUse = neon.DataUse.host_device(),
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
         field = self.grid.new_field(
             cardinality=cardinality,
             dtype=dtype,
+            data_use=neon_data_use,
         )
         for l in range(self.count_levels):
             if fill_value is None:
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 56a864a5..88b7322a 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -62,14 +62,21 @@ def prepare_fields(self, rho, u, initializer=None):
                 - missing_mask: Mask indicating which populations are missing at boundary nodes
         """
 
-        f_0 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
-        f_1 = self.grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        f_0 = self.grid.create_field(cardinality=self.velocity_set.q,
+                                     dtype=self.precision_policy.store_precision,
+                                     neon_data_use=neon.DataUse.host_device())
+
+        f_1 = self.grid.create_field(cardinality=self.velocity_set.q,
+                                     dtype=self.precision_policy.store_precision,
+                                     neon_data_use=neon.DataUse.host_device())
+
         missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
         bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
 
         from xlb.helper.initializers import initialize_multires_eq
 
-        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set, self.precision_policy, self.compute_backend, rho=rho, u=u)
+        f_0 = initialize_multires_eq(f_0, self.grid, self.velocity_set,
+                                     self.precision_policy, self.compute_backend, rho=rho, u=u)
 
         for level in range(self.grid.count_levels):
             f_1.copy_from_run(level, f_0, 0)

From 5f0950653234b9ba14c1ab438f6969f45c226853 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 17 Jul 2025 11:12:31 +0200
Subject: [PATCH 120/208] Perf: fusion at the finest level for mres.

- OptimizationType class to manage the different optimization strategies for the mres LBM
- Fused neon kernel for finest level of mres
---
 xlb/__init__.py                              |   2 +-
 xlb/helper/simulation_manager.py             | 184 +++++++++++--------
 xlb/operator/stepper/nse_multires_stepper.py | 153 +++++++++++++--
 xlb/optimization_type.py                     |  71 +++++++
 4 files changed, 318 insertions(+), 92 deletions(-)
 create mode 100644 xlb/optimization_type.py

diff --git a/xlb/__init__.py b/xlb/__init__.py
index b58db3bb..7009237e 100644
--- a/xlb/__init__.py
+++ b/xlb/__init__.py
@@ -2,7 +2,7 @@
 from xlb.compute_backend import ComputeBackend as ComputeBackend
 from xlb.precision_policy import PrecisionPolicy as PrecisionPolicy, Precision as Precision
 from xlb.physics_type import PhysicsType as PhysicsType
-
+from xlb.optimization_type import OptimizationType as OptimizationType
 # Config
 from .default_config import init as init, DefaultConfig as DefaultConfig
 
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index f6c7766b..7213d14e 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -2,6 +2,7 @@
 import warp as wp
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.macroscopic import MultiresMacroscopic
+from xlb.optimization_type import OptimizationType
 
 
 class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
@@ -10,46 +11,38 @@ class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
     """
 
     def __init__(
-        self,
-        omega,
-        grid,
-        boundary_conditions=[],
-        collision_type="BGK",
-        forcing_scheme="exact_difference",
-        force_vector=None,
-        initializer=None,
+            self,
+            omega,
+            grid,
+            boundary_conditions=[],
+            collision_type="BGK",
+            forcing_scheme="exact_difference",
+            force_vector=None,
+            initializer=None,
+            optimization_type: OptimizationType = OptimizationType.NAIVE_COLLIDE_STREAM,
     ):
         super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
         self.initializer = initializer
         self.omega = omega
         self.count_levels = grid.count_levels
+        self.optimization_type = optimization_type
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
-        self.coalescence_factor = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
+        self.coalescence_factor = grid.create_field(cardinality=self.velocity_set.q,
+                                                    dtype=self.precision_policy.store_precision)
 
         for level in range(self.count_levels):
             self.u.fill_run(level, 0.0, 0)
             self.rho.fill_run(level, 1.0, 0)
             self.coalescence_factor.fill_run(level, 0.0, 0)
 
-        # wp.synchronize()
-        # self.u.update_host(0)
-        # wp.synchronize()
-        # self.u.export_vti(f"u_{fname_prefix}_topology.vti", 'u')
-
         # Prepare fields
         self.f_0, self.f_1, self.bc_mask, self.missing_mask = self.prepare_fields(self.rho, self.u, self.initializer)
         self.prepare_coalescence_count(coalescence_factor=self.coalescence_factor, bc_mask=self.bc_mask)
 
-        # wp.synchronize()
-        # self.u.update_host(0)
-        # wp.synchronize()
-        # self.u.export_vti(f"u_t2_{fname_prefix}_topology.vti", 'u')
-
         self.iteration_idx = -1
-
         self.macro = MultiresMacroscopic(
             compute_backend=self.compute_backend,
             precision_policy=self.precision_policy,
@@ -79,7 +72,7 @@ def step(self):
     def _construct_stepper_skeleton(self):
         self.app = []
 
-        def recursion(level, app):
+        def recursion_reference(level, app):
             if level < 0:
                 return
             print(f"RECURSION down to level {level}")
@@ -96,21 +89,9 @@ def recursion(level, app):
                 omega=self.omega,
                 timestep=0,
             )
-            # if(level == 0):
-            #     wp.synchronize()
-            #     self.f_0.update_host(0)
-            #     self.f_1.update_host(0)
-            #     wp.synchronize()
-            #     self.f_0.export_vti(f"pop_0_", "pop_0")
-            #     self.f_1.export_vti(f"pop_1_", "pop_1")
-            #     # exit
-            #     import sys
-            #     print("exit")
-            #     #sys.exit()
-            #     pass
-
-            recursion(level - 1, app)
-            recursion(level - 1, app)
+
+            recursion_reference(level - 1, app)
+            recursion_reference(level - 1, app)
 
             # Important: swapping of f_0 and f_1 is done here
             print(f"RECURSION Level {level}, stream_coarse_step_ABC")
@@ -125,47 +106,96 @@ def recursion(level, app):
                 omega=self.coalescence_factor,
                 timestep=0,
             )
-            # print(f"RECURSION Level {level}, stream_coarse_step_B")
-            #
-            # self.add_to_app(
-            #     app=app,
-            #     op_name="stream_coarse_step_B",
-            #     mres_level=level,
-            #     f_0=self.f_1,
-            #     f_1=self.f_0,
-            #     bc_mask=self.bc_mask,
-            #     missing_mask=self.missing_mask,
-            #     omega=self.coalescence_factor,
-            #     timestep=0,
-            # )
-
-            # print(f"RECURSION Level {level}, stream_coarse_step_C")
-            #
-            # self.add_to_app(
-            #     app=app,
-            #     op_name="stream_coarse_step_C",
-            #     mres_level=level,
-            #     f_0=self.f_1,
-            #     f_1=self.f_0,
-            #     bc_mask=self.bc_mask,
-            #     missing_mask=self.missing_mask,
-            #     omega=self.omega,
-            #     timestep=0,
-            # )
-            # if(level == 1):
-            #     wp.synchronize()
-            #     self.f_0.update_host(0)
-            #     self.f_1.update_host(0)
-            #     wp.synchronize()
-            #     self.f_0.export_vti(f"pop_0_qq", "pop_0")
-            #     self.f_1.export_vti(f"pop_1_qq", "pop_1")
-            #     # exit
-            #     import sys
-            #     print("exit")
-            #     sys.exit()
-            #     pass
-
-        recursion(self.count_levels - 1, app=self.app)
+
+        def recursion_fused_finest(level,
+                                   app,
+                                   is_self_f1_the_explosion_src_field,
+                                   is_self_f1_the_coalescence_dst_field):
+            if level < 0:
+                return
+
+            if level == 0:
+                print(f"RECURSION down to the finest level {level}")
+                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull",
+                    mres_level=level,
+                    f_0=self.f_0,
+                    f_1=self.f_1,
+                    bc_mask=self.bc_mask,
+                    missing_mask=self.missing_mask,
+                    omega=self.omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=is_self_f1_the_explosion_src_field,
+                    is_f1_the_coalescence_dst_field=is_self_f1_the_coalescence_dst_field,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull",
+                    mres_level=level,
+                    f_0=self.f_1,
+                    f_1=self.f_0,
+                    bc_mask=self.bc_mask,
+                    missing_mask=self.missing_mask,
+                    omega=self.omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=not is_self_f1_the_explosion_src_field,
+                    is_f1_the_coalescence_dst_field=not is_self_f1_the_coalescence_dst_field,
+                )
+                return
+
+            print(f"RECURSION down to level {level}")
+            print(f"RECURSION Level {level}, COLLIDE")
+
+            self.add_to_app(
+                app=app,
+                op_name="collide_coarse",
+                mres_level=level,
+                f_0=self.f_0,
+                f_1=self.f_1,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.omega,
+                timestep=0,
+            )
+            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1,
+            #    so is_self_f1_the_coalescence_dst_field is True.
+            # 2. Explosion data is the output of the coarser collide, which is f_1=self.f_1,
+            #    so is_self_f1_the_explosion_src_field is True.
+
+            if level - 1 == 0:
+                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=True,
+                                       is_self_f1_the_coalescence_dst_field=True)
+            else:
+                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None,
+                                       is_self_f1_the_coalescence_dst_field=None)
+                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None,
+                                       is_self_f1_the_coalescence_dst_field=None)
+            # Important: swapping of f_0 and f_1 is done here
+            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            self.add_to_app(
+                app=app,
+                op_name="stream_coarse_step_ABC",
+                mres_level=level,
+                f_0=self.f_1,
+                f_1=self.f_0,
+                bc_mask=self.bc_mask,
+                missing_mask=self.missing_mask,
+                omega=self.coalescence_factor,
+                timestep=0,
+            )
+
+        if self.optimization_type == OptimizationType.NAIVE_COLLIDE_STREAM:
+            recursion_reference(self.count_levels - 1, app=self.app)
+        elif self.optimization_type == OptimizationType.FUSION_AT_FINEST:
+            # Flags are only read when the top level is level 0 (single-level grid). add_to_app drops them
+            # when None, leaving the fused container short of arguments, so pass booleans (TODO confirm values).
+            recursion_fused_finest(self.count_levels - 1, app=self.app, is_self_f1_the_explosion_src_field=False,
+                                   is_self_f1_the_coalescence_dst_field=False)
+        else:
+            raise ValueError(f"Unknown optimization level: {self.optimization_type}")
+
         bk = self.grid.get_neon_backend()
         self.sk = neon.Skeleton(backend=bk)
         self.sk.sequence("mres_nse_stepper", self.app)
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 4b694648..33593315 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -1,5 +1,5 @@
 # Base class for all multires stepper operators
-
+import nvtx
 import warp as wp
 import neon
 from typing import Any
@@ -75,21 +75,11 @@ def prepare_fields(self, rho, u, initializer=None):
 
         for level in range(self.grid.count_levels):
             f_1.copy_from_run(level, f_0, 0)
-        # f_0.update_host(0)
-        # wp.synchronize()
-        # f_0.export_vti("f0_eq_init.vti", "init_f0")
 
         # Process boundary conditions and update masks
         f_1, bc_mask, missing_mask = self._process_boundary_conditions(self.boundary_conditions, f_1, bc_mask, missing_mask)
         # Initialize auxiliary data if needed
         f_1 = self._initialize_auxiliary_data(self.boundary_conditions, f_1, bc_mask, missing_mask)
-        # bc_mask.update_host(0)
-        # bc_mask.update_host(0)
-        # f_0.update_host(0)
-        # wp.synchronize()
-        # bc_mask.export_vti("bc_mask.vti", "bc_mask")
-        # f_0.export_vti("init_f0.vti", 'init_f0')
-        # missing_mask.export_vti("missing_mask.vti", 'missing_mask')
 
         # Initialize distribution functions if initializer is provided
         if initializer is not None:
@@ -688,6 +678,133 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
+        @neon.Container.factory(name="finest_fused_pull")
+        def finest_fused_pull(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: Any,
+                is_f1_the_explosion_src_field: bool,
+                is_f1_the_coalescence_dst_field: bool,
+        ):
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+
+            num_levels = f_0_fd.get_grid().get_num_levels()
+
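+            # NOTE(review): is_f1_the_explosion_src_field selects f_1 (else f_0) for both the
+            # coarser-level explosion reads and the neon_mres_lbm_store_op writes below, while
+            # is_f1_the_coalescence_dst_field is never read in this container - confirm intent.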
+
+            # module op to define odd or even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                if level + 1 < f_0_fd.get_grid().get_num_levels():
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def cl_stream_coarse(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    for l in range(self.velocity_set.q):
+                        if l == lattice_central_index:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
+                                                       has_ngh_at_same_level)
+
+                        # NO finer ngh. in the pull direction (opposite of l)
+                        if not has_ngh_at_same_level:
+                            # NO ngh. at the same level
+                            # COULD we have a ngh. at the coarser level?
+                            if wp.neon_has_parent(f_0_pn, index):
+                                # YES halo cell on top of us
+                                has_a_courser_ngh = wp.bool(False)
+                                if is_f1_the_explosion_src_field:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                    )
+                                else:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                    )
+                                if has_a_courser_ngh:
+                                    # Full state:
+                                    # NO finer ngh. in the pull direction (opposite of l)
+                                    # NO ngh. at the same level
+                                    # YES ghost cell on top of us
+                                    # YES coarser ngh.
+                                    # -> **Explosion**
+                                    # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                    _f_post_stream[l] = exploded_pop
+
+                    # do non mres post-streaming corrections
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn,
+                                              _f_post_collision, _f_post_stream, True)
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    # Apply post-collision boundary conditions
+                    _f_post_collision = apply_bc(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream,
+                        _f_post_collision, False
+                    )
+
+                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
+                    # Emit the post-collision populations: mres store op on f_1/f_0 (per flag) when a higher level exists, then write f_1
+                    for l in range(self.velocity_set.q):
+                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                        if level < num_levels - 1:
+                            val = _f_post_collision[l]
+                            if is_f1_the_explosion_src_field:
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
+                            else:
+                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
+
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
         @neon.Container.factory(name="stream_coarse_step_C")
         def stream_coarse_step_C(
             level: int,
@@ -745,14 +862,22 @@ def cl_stream_coarse(index: Any):
             "stream_coarse_step_A": stream_coarse_step_A,
             "stream_coarse_step_B": stream_coarse_step_B,
             "stream_coarse_step_C": stream_coarse_step_C,
+            "finest_fused_pull": finest_fused_pull,  # used by the FUSION_AT_FINEST schedule
         }
 
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
-    def add_to_app(self, app, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
-        app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
-
+    def add_to_app(self, app, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep,
+                   is_f1_the_explosion_src_field: bool = None,
+                   is_f1_the_coalescence_dst_field: bool = None):
+        """Append the ``op_name`` container to ``app``; both flags are forwarded only if the first is not None."""
+        args = [mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep]
+        if is_f1_the_explosion_src_field is not None:
+            args += [is_f1_the_explosion_src_field, is_f1_the_coalescence_dst_field]
+        with nvtx.annotate(f"New Container {op_name}", color="yellow"):  # range is closed even on error
+            app.append(self.neon_container[op_name](*args))
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
diff --git a/xlb/optimization_type.py b/xlb/optimization_type.py
new file mode 100644
index 00000000..0e3bbe62
--- /dev/null
+++ b/xlb/optimization_type.py
@@ -0,0 +1,71 @@
+import argparse
+from enum import Enum
+
+
+class OptimizationType(Enum):
+    """
+    Enumeration of available optimization strategies for the LBM solver.
+
+    Supports parsing from either the enum member name (case-insensitive)
+    or its integer value, and provides a method to build the CLI parser.
+    """
+    NAIVE_COLLIDE_STREAM = 0  # reference schedule: separate collide and stream containers
+    FUSION_AT_FINEST = 1  # level 0 runs the fused "finest_fused_pull" container
+
+    @staticmethod
+    def from_string(value: str) -> "OptimizationType":
+        """
+        Parse a string to an OptimizationType.
+
+        Accepts either the enum member name (case-insensitive) or its integer value.
+
+        Args:
+            value: The enum name (e.g. 'naive_collide_stream') or integer value (e.g. '0').
+
+        Returns:
+            An OptimizationType member.
+
+        Raises:
+            argparse.ArgumentTypeError: If the input is invalid.
+        """
+        # Attempt to parse by name (case-insensitive)
+        key = value.strip().upper()
+        if key in OptimizationType.__members__:
+            return OptimizationType[key]
+
+        # Attempt to parse by integer value; int() and Enum lookup by value both raise ValueError
+        try:
+            int_value = int(value)
+            return OptimizationType(int_value)
+        except ValueError:
+            valid_options = ", ".join(f"{member.name}({member.value})" for member in OptimizationType)
+            raise argparse.ArgumentTypeError(
+                f"Invalid OptimizationType {value!r}. Choose from: {valid_options}."
+            ) from None
+
+    def __str__(self) -> str:
+        """
+        Return a human-readable string for the enum member.
+        """
+        return self.name
+
+    @staticmethod
+    def build_arg_parser() -> argparse.ArgumentParser:
+        """
+        Create and configure the argument parser with optimization option.
+
+        Returns:
+            A configured ArgumentParser instance.
+        """
+        parser = argparse.ArgumentParser(
+            description="Run the LBM multiresolution simulation with specified optimizations."
+        )
+        # Dynamically generate help text from enum members
+        valid_options = ", ".join(f"{member.name}({member.value})" for member in OptimizationType)
+        parser.add_argument(
+            "-o", "--optimization",
+            type=OptimizationType.from_string,
+            default=OptimizationType.NAIVE_COLLIDE_STREAM,
+            help=f"Select optimization strategy: {valid_options}",
+        )
+        return parser

From b27094e03e46ded8bfcc4a02710f654ffb45430c Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 18 Jul 2025 10:53:58 -0400
Subject: [PATCH 121/208] Generalized the MultiresIO class to be used for any
 given field (not predefined fields)

---
 .../cuboid_flow_past_sphere_3d.py             |  11 +-
 .../grid_refinement/flow_past_sphere_3d.py    |   2 +-
 xlb/utils/mesher.py                           | 199 +++++++++++-------
 3 files changed, 130 insertions(+), 82 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 44abbe47..df01a955 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -138,7 +138,7 @@ def prepare_sparsity_pattern(level_data):
 from xlb.utils import MultiresIO
 
 # Define an exporter for the multiresolution data
-exporter = MultiresIO(level_data)
+exporter = MultiresIO({"velocity": 3, "density": 1}, level_data)
 
 # Prepare the sparsity pattern and origins from the level data
 sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
@@ -274,15 +274,12 @@ def print_lift_drag(sim):
         # Call the exporter to save the current state
         nx, ny, nz = grid_shape_finest
         filename = f"multires_flow_over_sphere_3d_{step:04d}"
-        exporter.to_hdf5(filename, sim.u, sim.rho, compression="gzip", compression_opts=2)
+        exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=2)
         exporter.to_slice_image(
-            "velocity_x",
-            sim.u,
-            sim.rho,
+            filename,
+            {"velocity": sim.u},
             plane_point=(nx // 2, ny // 2, nz // 2),
             plane_normal=(0, 0, 1),
-            slice_thickness=1.0,
-            output_filename=f"{filename}_slice_x.png",
             grid_res=256,
             bounds=(0.4, 0.6, 0.4, 0.6),
         )
diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index cfe5fe08..7e87c332 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -69,7 +69,7 @@
 #     level_data.append(
 #         [level_list[level].astype(bool), voxel_size, level_origins[level], level],
 #     )
-# exporter = MultiresIO(level_data)
+# exporter = MultiresIO({"velocity": 3, "density": 1}, level_data)
 
 # Create the multires grid
 grid = multires_grid_factory(
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index e323c644..573b760c 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -126,12 +126,15 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
 
 
 class MultiresIO(object):
-    def __init__(self, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
+    def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
         """
         Initialize the MultiresIO object.
 
         Parameters
         ----------
+        field_name_cardinality_dict : dict
+            A dictionary mapping field names to their cardinalities.
+            Example: {'velocity_x': 1, 'velocity_y': 1, 'velocity': 3, 'density': 1}
         levels_data : list of tuples
             Each tuple contains (data, voxel_size, origin, level).
         scale : float or tuple, optional
@@ -152,6 +155,7 @@ def __init__(self, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
         coordinates = self._transform_coordinates(coordinates, scale, offset)
 
         # Assign to self
+        self.field_name_cardinality_dict = field_name_cardinality_dict
         self.levels_data = levels_data
         self.coordinates = coordinates
         self.connectivity = connectivity
@@ -159,7 +163,7 @@ def __init__(self, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
         self.total_cells = total_cells
 
         # Prepare and allocate the inputs for the NEON container
-        self.velocity_warp_list, self.density_warp_list, self.origin_list = self._prepare_container_inputs()
+        self.field_warp_dict, self.origin_list = self._prepare_container_inputs()
 
         # Construct the NEON container for exporting multi-resolution data
         self.container = self._construct_neon_container()
@@ -288,7 +292,7 @@ def save_xdmf(self, h5_filename, xmf_filename, total_cells, num_points, fields={
         print("\tXDMF file written successfully")
         return
 
-    def save_hdf5_file(self, filename, coordinates, connectivity, level_id_field, field_data, compression="gzip", compression_opts=0):
+    def save_hdf5_file(self, filename, coordinates, connectivity, level_id_field, fields_data, compression="gzip", compression_opts=0):
         """Write the processed mesh data to an HDF5 file.
         Parameters
         ----------
@@ -300,7 +304,7 @@ def save_hdf5_file(self, filename, coordinates, connectivity, level_id_field, fi
             An array of all connectivity data.
         level_id_field : numpy.ndarray
             An array of all level data.
-        field_data : dict
+        fields_data : dict
             A dictionary of all field data.
         compression : str, optional
             The compression method to use for the HDF5 file.
@@ -320,7 +324,7 @@ def save_hdf5_file(self, filename, coordinates, connectivity, level_id_field, fi
             )
             f.create_dataset("/Mesh/Level", data=level_id_field, compression=compression, compression_opts=compression_opts)
             fg = f.create_group("/Fields")
-            for fname, fdata in field_data.items():
+            for fname, fdata in fields_data.items():
                 fg.create_dataset(fname, data=fdata.astype(np.float32), compression=compression, compression_opts=compression_opts, chunks=True)
 
     def _merge_duplicates(self, coordinates, connectivity, levels_data):
@@ -374,20 +378,20 @@ def _prepare_container_inputs(self, store_precision=None):
             store_precision = DefaultConfig.default_precision_policy.store_precision
 
         # Prepare lists to hold warp fields and origins allocated for each level
-        velocity_warp_list = []
-        density_warp_list = []
+        field_warp_dict = {}
         origin_list = []
-        for level in range(num_levels):
-            # get the shape of the grid at this level
-            box_shape = self.levels_data[level][0].shape
+        for field_name, cardinality in self.field_name_cardinality_dict.items():
+            field_warp_dict[field_name] = []
+            for level in range(num_levels):
+                # get the shape of the grid at this level
+                box_shape = self.levels_data[level][0].shape
 
-            # Use the warp backend to create dense fields to be written in multi-res NEON fields
-            grid_dense = grid_factory(box_shape, compute_backend=ComputeBackend.WARP)
-            velocity_warp_list.append(grid_dense.create_field(cardinality=3, dtype=store_precision))
-            density_warp_list.append(grid_dense.create_field(cardinality=1, dtype=store_precision))
-            origin_list.append(wp.vec3i(*([int(x) for x in self.levels_data[level][2]])))
+                # Use the warp backend to create dense fields to be written in multi-res NEON fields
+                grid_dense = grid_factory(box_shape, compute_backend=ComputeBackend.WARP)
+                field_warp_dict[field_name].append(grid_dense.create_field(cardinality=cardinality, dtype=store_precision))
+                origin_list.append(wp.vec3i(*([int(x) for x in self.levels_data[level][2]])))
 
-        return velocity_warp_list, density_warp_list, origin_list
+        return field_warp_dict, origin_list
 
     def _construct_neon_container(self):
         """
@@ -397,22 +401,19 @@ def _construct_neon_container(self):
 
         @neon.Container.factory(name="HDF5MultiresExporter")
         def container(
-            velocity_neon: Any,
-            density_neon: Any,
-            velocity_warp: Any,
-            density_warp: Any,
+            field_neon: Any,
+            field_warp: Any,
             origin: Any,
             level: Any,
         ):
             def launcher(loader: neon.Loader):
-                loader.set_mres_grid(velocity_neon.get_grid(), level)
-                velocity_neon_hdl = loader.get_mres_read_handle(velocity_neon)
-                density_neon_hdl = loader.get_mres_read_handle(density_neon)
+                loader.set_mres_grid(field_neon.get_grid(), level)
+                field_neon_hdl = loader.get_mres_read_handle(field_neon)
                 refinement = 2**level
 
                 @wp.func
                 def kernel(index: Any):
-                    cIdx = wp.neon_global_idx(velocity_neon_hdl, index)
+                    cIdx = wp.neon_global_idx(field_neon_hdl, index)
                     # Get local indices by dividing the global indices (associated with the finest level) by 2^level
                     # Subtract the origin to get the local indices in the warp field
                     lx = wp.neon_get_x(cIdx) // refinement - origin[0]
@@ -420,9 +421,9 @@ def kernel(index: Any):
                     lz = wp.neon_get_z(cIdx) // refinement - origin[2]
 
                     # write the values to the warp field
-                    density_warp[0, lx, ly, lz] = wp.neon_read(density_neon_hdl, index, 0)
-                    for card in range(3):
-                        velocity_warp[card, lx, ly, lz] = wp.neon_read(velocity_neon_hdl, index, card)
+                    cardinality = field_warp.shape[0]
+                    for card in range(cardinality):
+                        field_warp[card, lx, ly, lz] = wp.neon_read(field_neon_hdl, index, card)
 
                 loader.declare_kernel(kernel)
 
@@ -430,41 +431,46 @@ def kernel(index: Any):
 
         return container
 
-    def get_fields_data(self, velocity_neon, density_neon):
+    def get_fields_data(self, field_neon_dict):
         """
         Extracts and prepares the fields data from the NEON fields for export.
         """
         # Ensure that this operator is called on multires grids
-        grid_mres = velocity_neon.get_grid()
+        grid_mres = next(iter(field_neon_dict.values())).get_grid()
         assert grid_mres.get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
 
+        for field_name in field_neon_dict.keys():
+            assert field_name in self.field_name_cardinality_dict.keys(), (
+                f"Field {field_name} is not provided in the instantiation of the MultiresIO class!"
+            )
+
         # number of levels
         num_levels = grid_mres.get_num_levels()
         assert num_levels == len(self.levels_data), "Error: Inconsistent number of levels!"
 
-        # Prepare the fields to be written by transfering multi-res NEON fields into stacked warp fields
-        fields_data = {
-            "velocity_x": [],
-            "velocity_y": [],
-            "velocity_z": [],
-            "density": [],
-        }
-        for level in range(num_levels):
-            # Create the container and run it to fill the warp fields
-            c = self.container(
-                velocity_neon, density_neon, self.velocity_warp_list[level], self.density_warp_list[level], self.origin_list[level], level
-            )
-            c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
-
-            # Convert the warp fields to numpy arrays and use level's mask to filter the data
-            mask = self.levels_data[level][0]
-            velocity_np = np.array(wp.to_jax(self.velocity_warp_list[level]))
-            rho = np.array(wp.to_jax(self.density_warp_list[level]))[0][mask]
-            vx, vy, vz = velocity_np[0][mask], velocity_np[1][mask], velocity_np[2][mask]
-            fields_data["velocity_x"].append(vx)
-            fields_data["velocity_y"].append(vy)
-            fields_data["velocity_z"].append(vz)
-            fields_data["density"].append(rho)
+        # Prepare the fields dictionary to be written by transferring multi-res NEON fields into stacked warp fields and then numpy arrays
+        fields_data = {}
+        for field_name, cardinality in self.field_name_cardinality_dict.items():
+            if field_name not in field_neon_dict:
+                continue
+            for card in range(cardinality):
+                fields_data[f"{field_name}_{card}"] = []
+
+        # Iterate over each field and level to fill the dictionary with numpy fields
+        for field_name, cardinality in self.field_name_cardinality_dict.items():
+            if field_name not in field_neon_dict:
+                continue
+            for level in range(num_levels):
+                # Create the container and run it to fill the warp fields
+                c = self.container(field_neon_dict[field_name], self.field_warp_dict[field_name][level], self.origin_list[level], level)
+                c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+                # Convert the warp fields to numpy arrays and use level's mask to filter the data
+                mask = self.levels_data[level][0]
+                field_np = np.array(wp.to_jax(self.field_warp_dict[field_name][level]))
+                for card in range(cardinality):
+                    field_np_card = field_np[card][mask]
+                    fields_data[f"{field_name}_{card}"].append(field_np_card)
 
         # Concatenate all field data
         for field_name in fields_data.keys():
@@ -473,17 +479,15 @@ def get_fields_data(self, velocity_neon, density_neon):
 
         return fields_data
 
-    def to_hdf5(self, filename, velocity_neon, density_neon, compression="gzip", compression_opts=0, store_precision=None):
+    def to_hdf5(self, output_filename, field_neon_dict, compression="gzip", compression_opts=0, store_precision=None):
         """
         Export the multi-resolution mesh data to an HDF5 file.
         Parameters
         ----------
-        filename : str
+        output_filename : str
             The name of the output HDF5 file (without extension).
-        velocity_neon : neon mGrid Field
-            The NEON field containing velocity data.
-        density_neon : neon mGrid Field
-            The NEON field containing density data.
+        field_neon_dict : a dictionary of neon mGrid Fields
+            Eg. The NEON fields containing velocity and density data as { "velocity": velocity_neon, "density": density_neon}
         compression : str, optional
             The compression method to use for the HDF5 file.
         compression_opts : int, optional
@@ -494,46 +498,47 @@ def to_hdf5(self, filename, velocity_neon, density_neon, compression="gzip", com
         import time
 
         # Get the fields data from the NEON fields
-        fields_data = self.get_fields_data(velocity_neon, density_neon)
+        fields_data = self.get_fields_data(field_neon_dict)
 
         # Save XDMF file
-        self.save_xdmf(filename + ".h5", filename + ".xmf", self.total_cells, len(self.coordinates), fields_data)
+        self.save_xdmf(output_filename + ".h5", output_filename + ".xmf", self.total_cells, len(self.coordinates), fields_data)
 
         # Writing HDF5 file
         print("\tWriting HDF5 file")
         tic_write = time.perf_counter()
-        self.save_hdf5_file(filename, self.coordinates, self.connectivity, self.level_id_field, fields_data, compression, compression_opts)
+        self.save_hdf5_file(output_filename, self.coordinates, self.connectivity, self.level_id_field, fields_data, compression, compression_opts)
         toc_write = time.perf_counter()
         print(f"\tHDF5 file written in {toc_write - tic_write:0.1f} seconds")
 
     def to_slice_image(
         self,
-        field_name,
-        velocity_neon,
-        density_neon,
+        output_filename,
+        field_neon_dict,
         plane_point,
         plane_normal,
-        slice_thickness,
-        output_filename,
+        slice_thickness=1.0,
         bounds=[0, 1, 0, 1],
         grid_res=512,
         cmap=None,
+        component=None,
+        **kwargs,
     ):
         """
         Export an arbitrary-plane slice from unstructured point data to PNG.
 
         Parameters
         ----------
-        field_name : str
-            The field to plot.
+        output_filename : str
+            Output PNG filename (without extension).
+        field_neon_dict : dict
+            A dictionary of NEON fields containing the data to be plotted.
+            Example: {"velocity": velocity_neon, "density": density_neon}
         plane_point : array_like
             A point [x, y, z] on the plane.
         plane_normal : array_like
             Plane normal vector [nx, ny, nz].
         slice_thickness : float
             How thick (in units of the coordinate system) the slice should be.
-        output_filename : str
-            Output PNG filename (without extension).
         grid_resolution : tuple
             Resolution of output image (pixels in plane u, v directions).
         grid_size : tuple
@@ -541,14 +546,59 @@ def to_slice_image(
         cmap : str
             Matplotlib colormap.
         """
+        # Get the fields data from the NEON fields
+        assert len(field_neon_dict.keys()) == 1, "Error: This function is designed to plot a single field at a time."
+        fields_data = self.get_fields_data(field_neon_dict)
+
+        # Check if the component is within the valid range
+        if component is None:
+            print("\tCreating slice image of the field magnitude!")
+            cell_data = list(fields_data.values())
+            squared = [comp**2 for comp in cell_data]
+            cell_data = np.sqrt(sum(squared))
+            field_name = list(fields_data.keys())[0].split('_')[0] + '_magnitude'
+        else:
+            assert component < max(self.field_name_cardinality_dict.values()), f"Error: Component {component} is out of range for the provided fields."
+            print(f"\tCreating slice image for component {component} of the input field!")
+            field_name = list(fields_data.keys())[component]
+            cell_data = fields_data[field_name]
+
+        # Plot each field in the dictionary
+        self._to_slice_image_single_field(
+            f"{output_filename}_{field_name}",
+            cell_data,
+            plane_point,
+            plane_normal,
+            slice_thickness=slice_thickness,
+            bounds=bounds,
+            grid_res=grid_res,
+            cmap=cmap,
+            **kwargs,
+        )
+        print(f"\tSlice image for field {field_name} saved as {output_filename}.png")
+
+    def _to_slice_image_single_field(
+        self,
+        output_filename,
+        field_data,
+        plane_point,
+        plane_normal,
+        slice_thickness,
+        bounds,
+        grid_res,
+        cmap,
+        **kwargs,
+    ):
+        """
+        Helper function to create a slice image for a single field.
+        """
         from matplotlib import cm
         import numpy as np
         import matplotlib.pyplot as plt
         from scipy.interpolate import griddata
 
-        # Get the fields data from the NEON fields
-        fields_data = self.get_fields_data(velocity_neon, density_neon)
-        cell_values = fields_data[field_name]
+        # field data are associated with the cells centers
+        cell_values = field_data
 
         # get the normalized plane normal
         plane_normal = np.asarray(plane_normal)
@@ -609,7 +659,8 @@ def to_slice_image(
             cmap=cmap,
             origin="lower",
             aspect="equal",
+            **kwargs,
         )
-        plt.colorbar(label=field_name)
+        plt.colorbar()
         plt.savefig(output_filename + ".png", dpi=300, bbox_inches="tight")
         plt.close()

From 59ef4aaef858312583c4b24fb943706d65ddbaa2 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 18 Jul 2025 17:00:15 -0400
Subject: [PATCH 122/208] Added a new method to multires grid class to enable
 extracting boundary indices across multiple levels.

---
 .../cuboid_flow_past_sphere_3d.py             |  1 +
 examples/performance/mlups_3d_multires.py     | 18 ---------
 xlb/grid/multires_grid.py                     | 40 +++++++++++++++++++
 xlb/utils/mesher.py                           | 29 +++++++-------
 4 files changed, 55 insertions(+), 33 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index df01a955..eda1ce5d 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -203,6 +203,7 @@ def bc_profile_warp(index: wp.vec3i):
 # Alternatively, use a prescribed velocity profile
 # bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
 bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
+# bc_ground = FullwayBounceBackBC(indices=grid.boundary_indices_across_levels(level_data, box_side="front"))
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
 bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index ab059032..f5df5fc3 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -61,24 +61,6 @@ def setup_simulation(args):
     return velocity_set
 
 
-# def construct_indices_per_level(grid_shape_finest, indices_finest, active_voxels_mask_per_level, level_origins):
-#     # TODO: HS: This is not the efficient way of doing this. I need to write a Warp operator for this purpose
-#     num_levels = len(active_voxels_mask_per_level)
-#     indices_list = []
-#     for level in range(num_levels):
-#         refinement = 2**level
-#         grid_shape = tuple(x // refinement for x in grid_shape_finest)
-#         mask = np.zeros(grid_shape, dtype=bool)
-#         ox, oy, oz = level_origins[level]
-#         Lx, Ly, Lz = active_voxels_mask_per_level[level].shape
-#         mask[ox : ox + Lx, oy : oy + Ly, oz : oz + Lz] = active_voxels_mask_per_level[level]
-#         indices_per_level = (np.array(indices_finest) // refinement)[:, ::refinement]
-#         mask_per_level = mask[tuple(indices_per_level)]
-#         active_bc_indices_per_level = indices_per_level[:, mask_per_level].tolist()
-#         indices_list.append(active_bc_indices_per_level)
-#     return indices_list
-
-
 def problem1(grid_shape, velocity_set):
     def peel(dim, idx, peel_level, outwards):
         if outwards:
diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index b86fed2c..1915fd98 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -93,3 +93,43 @@ def get_neon_backend(self):
     def level_to_shape(self, level):
         # level = 0 corresponds to the finest level
         return tuple(x // self.refinement_factor**level for x in self.shape)
+
+    def boundary_indices_across_levels(self, level_data, box_side: str = "front"):
+        """
+        Get indices for creating a boundary condition on the specified box side that crosses multiples levels of a multiresolution grid.
+        The indices are returned as a list of lists, where each sublist corresponds to a level
+
+        Parameters
+        ----------
+        - level_data: Level data containing the origins and sparsity patterns for each level as prepared by mesher/make_cuboid_mesh function!
+        - box_side: The side of the bounding box to get indices for (default is "front").
+        returns:
+        - A list of lists, where each sublist contains the indices for the boundary condition at that level.
+        """
+        num_levels = len(level_data)
+        bc_indices_list = []
+        for level in range(num_levels):
+            # Find active indices at this level
+            mask = level_data[level][0]
+            origin = level_data[level][2]
+            active_indices = np.nonzero(mask) + origin[:, None]
+
+            # Get bottom indices of the bounding box at this level
+            grid_shape = self.level_to_shape(level)
+            box = self.bounding_box_indices(shape=grid_shape)
+            bc_indices = np.array([box[box_side][i] for i in range(self.velocity_set.d)])
+
+            # Convert to flat indices
+            bc_indices = np.ravel_multi_index(bc_indices, grid_shape)
+            active_indices = np.ravel_multi_index(active_indices, grid_shape)
+
+            # Find common rows
+            common = np.intersect1d(active_indices, bc_indices)
+
+            # Append common points at this level to a list
+            if common.size == 0:
+                bc_indices_list.append([])
+            else:
+                active_bc_indices = np.unravel_index(common, grid_shape)
+                bc_indices_list.append([arr.tolist() for arr in active_bc_indices])
+        return bc_indices_list
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 573b760c..198b02ba 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -8,24 +8,27 @@
 
 def adjust_bbox(cuboid_max, cuboid_min, voxel_size_coarsest):
     """
-    Adjust the bounding box to the nearest level 0 grid points that enclose the desired region.
+    Adjust the bounding box to the nearest points of coarsest level that enclose the desired region.
 
     Args:
         cuboid_min (np.ndarray): Desired minimum coordinates of the bounding box.
         cuboid_max (np.ndarray): Desired maximum coordinates of the bounding box.
-        voxel_size_coarsest (float): Voxel size of the coarsest grid (level 0).
+        voxel_size_coarsest (float): Voxel size of the coarsest grid.
 
     Returns:
-        tuple: (adjusted_min, adjusted_max) snapped to level 0 grid points.
+        tuple: (adjusted_min, adjusted_max) snapped to coarsest level grid points.
     """
     adjusted_min = np.round(cuboid_min / voxel_size_coarsest) * voxel_size_coarsest
     adjusted_max = np.round(cuboid_max / voxel_size_coarsest) * voxel_size_coarsest
+
+    # Ensure that the adjusted min is not less than zero
+    adjusted_min = np.maximum(adjusted_min, 0)
     return adjusted_min, adjusted_max
 
 
 def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
     """
-    Create a multi-level cuboid mesh with bounding boxes aligned to the level 0 grid.
+    Create a multi-level cuboid mesh with bounding boxes aligned to the coarsest level grid.
     Voxel matrices are set to ones only in regions not covered by finer levels.
 
     Args:
@@ -71,23 +74,17 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
 
         # Set voxel size for this level
         voxel_size_level = max_voxel_size / pow(2, level)
-        if level > 0:
-            voxel_level_up = max_voxel_size / pow(2, level - 1)
-        else:
-            voxel_level_up = voxel_size_level
-        # Adjust bounding box to align with level 0 grid
-        adjusted_min, adjusted_max = adjust_bbox(cuboid_max, cuboid_min, voxel_level_up)
 
+        # Adjust bounding box to align with coarsest level grid
+        adjusted_min, adjusted_max = adjust_bbox(cuboid_max, cuboid_min, max_voxel_size)
         xmin, ymin, zmin = adjusted_min
         xmax, ymax, zmax = adjusted_max
 
-        cuboid = adjusted_max - adjusted_min
-
         # Compute number of voxels based on level-specific voxel size
         nx = int(np.round((xmax - xmin) / voxel_size_level))
         ny = int(np.round((ymax - ymin) / voxel_size_level))
         nz = int(np.round((zmax - zmin) / voxel_size_level))
-        print(f"Domain {nx}, {ny}, {nz}  Origin {adjusted_min}  Voxel Size {voxel_size_level} Voxel Level Up {voxel_level_up}")
+        print(f"Domain {nx}, {ny}, {nz}  Origin {adjusted_min}  Voxel Size {voxel_size_level}")
 
         voxel_matrix = np.ones((nx, ny, nz), dtype=bool)
 
@@ -556,9 +553,11 @@ def to_slice_image(
             cell_data = list(fields_data.values())
             squared = [comp**2 for comp in cell_data]
             cell_data = np.sqrt(sum(squared))
-            field_name = list(fields_data.keys())[0].split('_')[0] + '_magnitude'
+            field_name = list(fields_data.keys())[0].split("_")[0] + "_magnitude"
         else:
-            assert component < max(self.field_name_cardinality_dict.values()), f"Error: Component {component} is out of range for the provided fields."
+            assert component < max(self.field_name_cardinality_dict.values()), (
+                f"Error: Component {component} is out of range for the provided fields."
+            )
             print(f"\tCreating slice image for component {component} of the input field!")
             field_name = list(fields_data.keys())[component]
             cell_data = fields_data[field_name]

From 706f6043449f5e40caff5c1a9dc658366b7e7954 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sun, 20 Jul 2025 20:31:48 -0400
Subject: [PATCH 123/208] cast to store dtype for any input type of mres IO

---
 xlb/utils/mesher.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 198b02ba..a347bf1d 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -123,7 +123,7 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
 
 
 class MultiresIO(object):
-    def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.0, 0.0, 0.0)):
+    def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.0, 0.0, 0.0), store_precision=None):
         """
         Initialize the MultiresIO object.
 
@@ -159,6 +159,13 @@ def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.
         self.level_id_field = level_id_field
         self.total_cells = total_cells
 
+        # Set the default precision policy if not provided
+        from xlb import DefaultConfig
+
+        if store_precision is None:
+            self.store_precision = DefaultConfig.default_precision_policy.store_precision
+            self.store_dtype = DefaultConfig.default_precision_policy.store_precision.wp_dtype
+
         # Prepare and allocate the inputs for the NEON container
         self.field_warp_dict, self.origin_list = self._prepare_container_inputs()
 
@@ -361,19 +368,14 @@ def _transform_coordinates(self, coordinates, scale, offset):
         offset = np.array(offset, dtype=np.float32)
         return coordinates * scale + offset
 
-    def _prepare_container_inputs(self, store_precision=None):
+    def _prepare_container_inputs(self):
         # load necessary modules
         from xlb.compute_backend import ComputeBackend
         from xlb.grid import grid_factory
-        from xlb import DefaultConfig
 
         # Get the number of levels from the levels_data
         num_levels = len(self.levels_data)
 
-        # Set the default precision policy if not provided
-        if store_precision is None:
-            store_precision = DefaultConfig.default_precision_policy.store_precision
-
         # Prepare lists to hold warp fields and origins allocated for each level
         field_warp_dict = {}
         origin_list = []
@@ -385,7 +387,7 @@ def _prepare_container_inputs(self, store_precision=None):
 
                 # Use the warp backend to create dense fields to be written in multi-res NEON fields
                 grid_dense = grid_factory(box_shape, compute_backend=ComputeBackend.WARP)
-                field_warp_dict[field_name].append(grid_dense.create_field(cardinality=cardinality, dtype=store_precision))
+                field_warp_dict[field_name].append(grid_dense.create_field(cardinality=cardinality, dtype=self.store_precision))
                 origin_list.append(wp.vec3i(*([int(x) for x in self.levels_data[level][2]])))
 
         return field_warp_dict, origin_list
@@ -420,7 +422,7 @@ def kernel(index: Any):
                     # write the values to the warp field
                     cardinality = field_warp.shape[0]
                     for card in range(cardinality):
-                        field_warp[card, lx, ly, lz] = wp.neon_read(field_neon_hdl, index, card)
+                        field_warp[card, lx, ly, lz] = self.store_dtype(wp.neon_read(field_neon_hdl, index, card))
 
                 loader.declare_kernel(kernel)
 

From ea4a28c0d95080d27075acff40e30f29dcce7fb6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sun, 20 Jul 2025 20:39:59 -0400
Subject: [PATCH 124/208] added remove_edges as an input arg to
 boundary_indices_across_levels

---
 xlb/grid/multires_grid.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index 1915fd98..749dbfe9 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -94,7 +94,7 @@ def level_to_shape(self, level):
         # level = 0 corresponds to the finest level
         return tuple(x // self.refinement_factor**level for x in self.shape)
 
-    def boundary_indices_across_levels(self, level_data, box_side: str = "front"):
+    def boundary_indices_across_levels(self, level_data, box_side: str = "front", remove_edges: bool = False):
         """
         Get indices for creating a boundary condition on the specified box side that crosses multiples levels of a multiresolution grid.
         The indices are returned as a list of lists, where each sublist corresponds to a level
@@ -116,7 +116,7 @@ def boundary_indices_across_levels(self, level_data, box_side: str = "front"):
 
             # Get bottom indices of the bounding box at this level
             grid_shape = self.level_to_shape(level)
-            box = self.bounding_box_indices(shape=grid_shape)
+            box = self.bounding_box_indices(shape=grid_shape, remove_edges=remove_edges)
             bc_indices = np.array([box[box_side][i] for i in range(self.velocity_set.d)])
 
             # Convert to flat indices

From b067e6b82e74570b676663b691917d19fa6b6cfb Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 21 Jul 2025 09:40:34 -0400
Subject: [PATCH 125/208] Fixed an error in cuboid mesher introduced in
 previous commits

---
 xlb/utils/mesher.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index a347bf1d..13ea10e2 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -6,30 +6,27 @@
 import warp as wp
 
 
-def adjust_bbox(cuboid_max, cuboid_min, voxel_size_coarsest):
+def adjust_bbox(cuboid_max, cuboid_min, voxel_size_up):
     """
-    Adjust the bounding box to the nearest points of coarsest level that enclose the desired region.
+    Adjust the bounding box to the nearest points of one level finer grid that encloses the desired region.
 
     Args:
         cuboid_min (np.ndarray): Desired minimum coordinates of the bounding box.
         cuboid_max (np.ndarray): Desired maximum coordinates of the bounding box.
-        voxel_size_coarsest (float): Voxel size of the coarsest grid.
+        voxel_size_up (float): Voxel size of one level higher (finer) grid.
 
     Returns:
-        tuple: (adjusted_min, adjusted_max) snapped to coarsest level grid points.
+        tuple: (adjusted_min, adjusted_max) snapped to grid points of one level higher.
     """
-    adjusted_min = np.round(cuboid_min / voxel_size_coarsest) * voxel_size_coarsest
-    adjusted_max = np.round(cuboid_max / voxel_size_coarsest) * voxel_size_coarsest
-
-    # Ensure that the adjusted min is not less than zero
-    adjusted_min = np.maximum(adjusted_min, 0)
+    adjusted_min = np.round(cuboid_min / voxel_size_up) * voxel_size_up
+    adjusted_max = np.round(cuboid_max / voxel_size_up) * voxel_size_up
     return adjusted_min, adjusted_max
 
 
 def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
     """
-    Create a multi-level cuboid mesh with bounding boxes aligned to the coarsest level grid.
-    Voxel matrices are set to ones only in regions not covered by finer levels.
+    Create a strongly-balanced multi-level cuboid mesh with a sequence of bounding boxes.
+    Outputs mask arrays that are set to True only in regions not covered by finer levels.
 
     Args:
         voxel_size (float): Voxel size of the finest grid .
@@ -37,7 +34,7 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
         stl_name (str): Path to the STL file.
 
     Returns:
-        list: Level data with voxel matrices, voxel sizes, origins, and levels.
+        list: Level data with mask arrays, voxel sizes, origins, and levels.
     """
     # Load the mesh and get its bounding box
     mesh = trimesh.load_mesh(stl_filename, process=False)
@@ -75,8 +72,13 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
         # Set voxel size for this level
         voxel_size_level = max_voxel_size / pow(2, level)
 
-        # Adjust bounding box to align with coarsest level grid
-        adjusted_min, adjusted_max = adjust_bbox(cuboid_max, cuboid_min, max_voxel_size)
+        # Adjust bounding box to align with one level up (finer grid)
+        if level > 0:
+            voxel_level_up = max_voxel_size / pow(2, level - 1)
+        else:
+            voxel_level_up = voxel_size_level
+        adjusted_min, adjusted_max = adjust_bbox(cuboid_max, cuboid_min, voxel_level_up)
+
         xmin, ymin, zmin = adjusted_min
         xmax, ymax, zmax = adjusted_max
 
@@ -84,7 +86,7 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
         nx = int(np.round((xmax - xmin) / voxel_size_level))
         ny = int(np.round((ymax - ymin) / voxel_size_level))
         nz = int(np.round((zmax - zmin) / voxel_size_level))
-        print(f"Domain {nx}, {ny}, {nz}  Origin {adjusted_min}  Voxel Size {voxel_size_level}")
+        print(f"Domain {nx}, {ny}, {nz}  Origin {adjusted_min}  Voxel Size {voxel_size_level} Voxel Level Up {voxel_level_up}")
 
         voxel_matrix = np.ones((nx, ny, nz), dtype=bool)
 

From e8f41aa5738fa791c7466258d04e2aece7136878 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 21 Jul 2025 16:32:58 -0400
Subject: [PATCH 126/208] fixed a couple of issues in MresIO class

---
 .../cfd/grid_refinement/cuboid_flow_past_sphere_3d.py     | 1 +
 xlb/utils/mesher.py                                       | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index eda1ce5d..7d9d0a7a 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -282,6 +282,7 @@ def print_lift_drag(sim):
             plane_point=(nx // 2, ny // 2, nz // 2),
             plane_normal=(0, 0, 1),
             grid_res=256,
+            slice_thickness=2 ** (num_levels - 1),
             bounds=(0.4, 0.6, 0.4, 0.6),
         )
 
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 13ea10e2..c521c11e 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -436,6 +436,10 @@ def get_fields_data(self, field_neon_dict):
         """
         Extracts and prepares the fields data from the NEON fields for export.
         """
+        # Check if the field_neon_dict is empty
+        if not field_neon_dict:
+            return {}
+
         # Ensure that this operator is called on multires grids
         grid_mres = next(iter(field_neon_dict.values())).get_grid()
         assert grid_mres.get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
@@ -604,7 +608,7 @@ def _to_slice_image_single_field(
         cell_values = field_data
 
         # get the normalized plane normal
-        plane_normal = np.asarray(plane_normal)
+        plane_normal = np.asarray(np.abs(plane_normal))
         n = plane_normal / np.linalg.norm(plane_normal)
 
         # Compute centroids (K = 8 for hexahedral cells)
@@ -632,7 +636,7 @@ def _to_slice_image_single_field(
             u1 = np.array([0, 1, 0])
         else:
             u1 = np.array([1, 0, 0])
-        u2 = np.cross(n, u1)
+        u2 = np.abs(np.cross(n, u1))
 
         local_x = np.dot(proj - plane_point, u1)
         local_y = np.dot(proj - plane_point, u2)

From e78db3b97fe136d4edcf03889cb75f3000e85acb Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 22 Jul 2025 09:34:12 -0400
Subject: [PATCH 127/208] addressed PR review comments

---
 examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index df01a955..21c687f4 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -274,6 +274,7 @@ def print_lift_drag(sim):
         # Call the exporter to save the current state
         nx, ny, nz = grid_shape_finest
         filename = f"multires_flow_over_sphere_3d_{step:04d}"
+        wp.synchronize()
         exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=2)
         exporter.to_slice_image(
             filename,

From 5bd71c7627b799d28a0857dbdf570dd7326bef30 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 22 Jul 2025 16:21:15 +0200
Subject: [PATCH 128/208] Refactoring: renaming DataUse to MemoryType

---
 xlb/grid/multires_grid.py                    | 4 ++--
 xlb/operator/stepper/nse_multires_stepper.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index 73a2e908..e029602f 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -74,13 +74,13 @@ def create_field(
         cardinality: int,
         dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
         fill_value=None,
-        neon_data_use: neon.DataUse = neon.DataUse.host_device(),
+        neon_memory_type: neon.MemoryType = neon.MemoryType.host_device(),
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
         field = self.grid.new_field(
             cardinality=cardinality,
             dtype=dtype,
-            data_use=neon_data_use,
+            memory_type=neon_memory_type,
         )
         for l in range(self.count_levels):
             if fill_value is None:
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 4b694648..feb40e3d 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -64,11 +64,11 @@ def prepare_fields(self, rho, u, initializer=None):
 
         f_0 = self.grid.create_field(cardinality=self.velocity_set.q,
                                      dtype=self.precision_policy.store_precision,
-                                     neon_data_use=neon.DataUse.host_device())
+                                     neon_memory_type=neon.MemoryType.host_device())
 
         f_1 = self.grid.create_field(cardinality=self.velocity_set.q,
                                      dtype=self.precision_policy.store_precision,
-                                     neon_data_use=neon.DataUse.host_device())
+                                     neon_memory_type=neon.MemoryType.host_device())
 
         missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
         bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)

From f06157379d9d9829b2680c08155ff37ea8caad4a Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 22 Jul 2025 17:03:20 +0200
Subject: [PATCH 129/208] Debug: adding function to print neon grid
 information.

---
 xlb/grid/multires_grid.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index e029602f..b5821976 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -67,6 +67,7 @@ def _initialize_backend(self):
             sparsity_pattern_origins=self.sparsity_pattern_origins,
             stencil=self.neon_stencil,
         )
+        self.grid.print_info()
         pass
 
     def create_field(

From 075343f21f966ab508ce6ab48582c0a5660c42cb Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 22 Jul 2025 17:05:24 -0400
Subject: [PATCH 130/208] Fixed the IO culprit: missing wp.synchronize()

---
 xlb/utils/mesher.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index c521c11e..93ad90cf 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -470,6 +470,9 @@ def get_fields_data(self, field_neon_dict):
                 c = self.container(field_neon_dict[field_name], self.field_warp_dict[field_name][level], self.origin_list[level], level)
                 c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
+                # Ensure all operations are complete before converting to JAX and Numpy arrays
+                wp.synchronize()
+
                 # Convert the warp fields to numpy arrays and use level's mask to filter the data
                 mask = self.levels_data[level][0]
                 field_np = np.array(wp.to_jax(self.field_warp_dict[field_name][level]))

From 6390957a6f604bec0ff3ef9d2ab3a11107eab3a6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 22 Jul 2025 17:05:39 -0400
Subject: [PATCH 131/208] ruff changes

---
 xlb/operator/stepper/nse_multires_stepper.py | 12 ++++++------
 xlb/utils/mesher.py                          |  2 ++
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index feb40e3d..9dbca95f 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -62,13 +62,13 @@ def prepare_fields(self, rho, u, initializer=None):
                 - missing_mask: Mask indicating which populations are missing at boundary nodes
         """
 
-        f_0 = self.grid.create_field(cardinality=self.velocity_set.q,
-                                     dtype=self.precision_policy.store_precision,
-                                     neon_memory_type=neon.MemoryType.host_device())
+        f_0 = self.grid.create_field(
+            cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision, neon_memory_type=neon.MemoryType.host_device()
+        )
 
-        f_1 = self.grid.create_field(cardinality=self.velocity_set.q,
-                                     dtype=self.precision_policy.store_precision,
-                                     neon_memory_type=neon.MemoryType.host_device())
+        f_1 = self.grid.create_field(
+            cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision, neon_memory_type=neon.MemoryType.host_device()
+        )
 
         missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)
         bc_mask = self.grid.create_field(cardinality=1, dtype=Precision.UINT8)
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 93ad90cf..d857741a 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -140,6 +140,8 @@ def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.
             Scale factor for the coordinates.
         offset : tuple, optional
             Offset to be applied to the coordinates.
+        store_precision : str, optional
+            The precision policy for storing data.
         """
         # Process the multires geometry and extract coordinates and connectivity in the coordinate system of the finest level
         coordinates, connectivity, level_id_field, total_cells = self.process_geometry(levels_data, scale)

From 3c7ed771e6c11274dee9e3b499739c9092ab13f7 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 23 Jul 2025 17:28:07 +0200
Subject: [PATCH 132/208] Enhancement: integrate neon skeleton with dGrid
 stepper

---
 examples/performance/mlups_3d.py    | 32 +++++++++++++++++++++++++++--
 xlb/grid/neon_grid.py               |  6 +++---
 xlb/operator/stepper/nse_stepper.py | 20 ++++++++++++++++--
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 1a2ebdae..de67d7b8 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -9,6 +9,8 @@
 from xlb.operator.stepper import IncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
 from xlb.distribute import distribute
+from xlb.operator.macroscopic import Macroscopic
+
 
 # -------------------------- Simulation Setup --------------------------
 
@@ -17,7 +19,7 @@ def parse_arguments():
     parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
     parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
     parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation")
-    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax or warp)")
+    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
     return parser.parse_args()
 
@@ -80,6 +82,14 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps):
     # Initialize fields
     omega = 1.0
     f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
+    if compute_backend == ComputeBackend.NEON:
+        stepper.prepare_skeleton(f_0, f_1, bc_mask, missing_mask, omega)
+
+    # Warm-up iterations
+    for i in range(10):
+        f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
+        f_0, f_1 = f_1, f_0
+    wp.synchronize()
 
     start_time = time.time()
     for i in range(num_steps):
@@ -88,11 +98,29 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps):
     wp.synchronize()
     elapsed_time = time.time() - start_time
 
+    # Define Macroscopic Calculation
+    macro = Macroscopic(
+        compute_backend=compute_backend,
+        precision_policy=precision_policy,
+        velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend),
+    )
+    # if compute_backend == ComputeBackend.NEON:
+    #
+    #     rho = grid.create_field(cardinality=1, dtype=precision_policy.store_precision)
+    #     u = grid.create_field(cardinality=3, dtype=precision_policy.store_precision)
+    #
+    #     macro(f_0, rho, u)
+    #
+    #     wp.synchronize()
+    #     u.update_host(0)
+    #     wp.synchronize()
+    #     u.export_vti(f"{"mlups"}{num_steps}.vti", "u")
+
     return elapsed_time
 
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    total_lattice_updates = cube_edge**3 * num_steps
+    total_lattice_updates = cube_edge ** 3 * num_steps
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index 31a08c30..e0851332 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -24,9 +24,9 @@ def _get_velocity_set(self):
 
     def _initialize_backend(self):
         # FIXME@max: for now we hardcode the number of devices to 0
-        num_devs = 1
+        num_devs = 2
         dev_idx_list = list(range(num_devs))
-
+        dev_idx_list = [0,1]
         if len(self.shape) == 2:
             import py_neon
 
@@ -45,7 +45,7 @@ def _initialize_backend(self):
                 self.neon_stencil.append([xval, yval, zval])
 
         self.bk = neon.Backend(runtime=neon.Backend.Runtime.stream, dev_idx_list=dev_idx_list)
-
+        self.bk.info_print()
         self.grid = neon.dense.dGrid(backend=self.bk, dim=self.dim, sparsity=None, stencil=self.neon_stencil)
         pass
 
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 3adb1d00..ecf37bbf 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -512,6 +512,22 @@ def nse_stepper_cl(index: Any):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
-        c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
-        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+        self.sk[self.sk_iter].run()
+        self.sk_iter = (self.sk_iter + 1) % 2
         return f_0, f_1
+
+    def prepare_skeleton(self, f_0, f_1, bc_mask, missing_mask, omega):
+        grid = f_0.get_grid()
+        bk = grid.get_backend()
+        self.neon_skeleton = {'odd': {}, 'even': {}}
+        self.neon_skeleton['odd']['container'] = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, 0)
+        self.neon_skeleton['even']['container'] = self.neon_container(f_1, f_0, bc_mask, missing_mask, omega, 1)
+
+        for key in self.neon_skeleton:
+            self.neon_skeleton[key]['app'] = [self.neon_skeleton[key]['container']]
+            self.neon_skeleton[key]['skeleton'] = neon.Skeleton(backend=bk)
+            self.neon_skeleton[key]['skeleton'].sequence("mres_nse_stepper", self.neon_skeleton[key]['app'])
+
+        self.sk = [self.neon_skeleton['odd']['skeleton'],
+                   self.neon_skeleton['even']['skeleton']]
+        self.sk_iter = 0

From 725f13350273e71af62e7314ed9d58cc6b06c713 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 1 Aug 2025 16:11:10 -0400
Subject: [PATCH 133/208] Added a minimal LBM stepper to the MomentumTransfer
 operator

---
 xlb/operator/force/momentum_transfer.py       | 110 ++++++++++++++----
 .../force/multires_momentum_transfer.py       |   9 +-
 2 files changed, 94 insertions(+), 25 deletions(-)

diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 97ba8df3..dbe58f36 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -12,6 +12,77 @@
 import neon
 
 
+class MomentumTransferLBMStepper(Operator):
+    """
+    This operator is used to apply the streaming and boundary conditions to the post-collision distribution function.
+    Note that for dense and single resolution simulations in XLB, f_0 represents the post-collision distribution
+    function. Therefore, we do not need to apply the collision operator here. See the MultiresStepper function in
+    the multires_momentum_transfer for the multi-resolution case.
+    """
+
+    def __init__(
+        self,
+        no_slip_bc_instance,
+        collision_type=None,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        self.no_slip_bc_instance = no_slip_bc_instance
+        self.stream = Stream(velocity_set, precision_policy, compute_backend)
+
+        if compute_backend == ComputeBackend.WARP:
+            self.stream_functional = self.stream.warp_functional
+            self.bc_functional = self.no_slip_bc_instance.warp_functional
+        elif compute_backend == ComputeBackend.NEON:
+            self.stream_functional = self.stream.neon_functional
+            self.bc_functional = self.no_slip_bc_instance.neon_functional
+
+        # Call the parent constructor
+        super().__init__(
+            velocity_set,
+            precision_policy,
+            compute_backend,
+        )
+
+    @Operator.register_backend(ComputeBackend.JAX)
+    @partial(jit, static_argnums=(0))
+    def jax_implementation(self, f_0, f_1, bc_mask, missing_mask):
+        # Given the input post-collision populations, stream once and apply the BC to find the post-stream values.
+        f_post_collision = f_0
+        f_post_stream = self.stream(f_post_collision)
+        f_post_stream = self.no_slip_bc_instance(f_post_collision, f_post_stream, bc_mask, missing_mask)
+        return f_post_collision, f_post_stream
+
+    def _construct_warp(self):
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(
+            index: Any,
+            f_0: Any,
+            f_1: Any,
+            _missing_mask: Any,
+        ):
+            # Get the distribution function
+            f_post_collision = _f_vec()
+            for l in range(self.velocity_set.q):
+                f_post_collision[l] = self.compute_dtype(self.read_field(f_0, index, l))
+
+            # Apply streaming (pull method)
+            timestep = 0
+            f_post_stream = self.stream_functional(f_0, index)
+            f_post_stream = self.bc_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
+            return f_post_collision, f_post_stream
+
+        return functional, None
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+        return functional, None
+
+
 class MomentumTransfer(Operator):
     """
     An opertor for the momentum exchange method to compute the boundary force vector exerted on the solid geometry
@@ -39,8 +110,16 @@ def __init__(
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
     ):
+        # Assign the no-slip boundary condition instance
         self.no_slip_bc_instance = no_slip_bc_instance
-        self.stream = Stream(velocity_set, precision_policy, compute_backend)
+
+        # Define the **minimal** stepper operator needed for the momentum transfer
+        self.stepper = MomentumTransferLBMStepper(
+            no_slip_bc_instance,
+            velocity_set=velocity_set,
+            precision_policy=precision_policy,
+            compute_backend=compute_backend,
+        )
 
         # Call the parent constructor
         super().__init__(
@@ -49,9 +128,10 @@ def __init__(
             compute_backend,
         )
 
-        # Allocate the force vector (the total integral value will be computed)
-        _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        self.force = wp.zeros((1), dtype=_u_vec)
+        if self.compute_backend != ComputeBackend.JAX:
+            # Allocate the force vector (the total integral value will be computed)
+            _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
+            self.force = wp.zeros((1), dtype=_u_vec)
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
@@ -76,9 +156,7 @@ def jax_implementation(self, f_0, f_1, bc_mask, missing_mask):
             The force exerted on the solid geometry at each boundary node.
         """
         # Give the input post-collision populations, streaming once and apply the BC the find post-stream values.
-        f_post_collision = f_0
-        f_post_stream = self.stream(f_post_collision)
-        f_post_stream = self.no_slip_bc_instance(f_post_collision, f_post_stream, bc_mask, missing_mask)
+        f_post_collision, f_post_stream = self.stepper(f_0, f_1, bc_mask, missing_mask)
 
         # Compute momentum transfer
         boundary = bc_mask == self.no_slip_bc_instance.id
@@ -98,7 +176,6 @@ def _construct_warp(self):
         # Set local constants
         _c = self.velocity_set.c
         _opp_indices = self.velocity_set.opp_indices
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _no_slip_id = self.no_slip_bc_instance.id
@@ -130,15 +207,8 @@ def functional(
             # If the boundary is an edge then add the momentum transfer
             m = _u_vec()
             if is_edge:
-                # Get the distribution function
-                f_post_collision = _f_vec()
-                for l in range(self.velocity_set.q):
-                    f_post_collision[l] = self.compute_dtype(self.read_field(f_0, index, l))
-
-                # Apply streaming (pull method)
-                timestep = 0
-                f_post_stream = self.stream_functional(f_0, index)
-                f_post_stream = self.no_slip_bc_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
+                # Apply one **minimal** LBM step
+                f_post_collision, f_post_stream = self.stepper_functional(index, f_0, f_1, _missing_mask)
 
                 # Compute the momentum transfer
                 for d in range(self.velocity_set.d):
@@ -184,8 +254,7 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
         self.force *= 0.0
 
         # Define the warp functionals needed for this operation
-        self.stream_functional = self.stream.warp_functional
-        self.no_slip_bc_functional = self.no_slip_bc_instance.warp_functional
+        self.stepper_functional = self.stepper.warp_functional
 
         # Launch the warp kernel
         wp.launch(
@@ -245,8 +314,7 @@ def neon_implementation(
         self.force *= 0.0
 
         # Define the neon functionals needed for this operation
-        self.stream_functional = self.stream.neon_functional
-        self.no_slip_bc_functional = self.no_slip_bc_instance.neon_functional
+        self.stepper_functional = self.stepper.neon_functional
 
         # Launch the neon container
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, self.force)
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index d1db6b07..18e3d0f9 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -19,6 +19,7 @@ class MultiresMomentumTransfer(MomentumTransfer):
     def __init__(
         self,
         no_slip_bc_instance,
+        collision_type="BGK",
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
@@ -59,9 +60,10 @@ def container_launcher(loader: neon.Loader):
 
                 # Important: Note the swap to the order of f_0 and f_1 in the functional call.
                 # This is because the multiresolution simulation first performs collision and then streaming and hence
-                # f_0 refers to the post-streaming distribution function and f_1 refers to the pre-collision distribution function.
+                # f_0 refers to the post-streaming distribution function and f_1 refers to the post-collision distribution function.
                 # This is in contrast to our dense implementations (all backends) where streaming occurs first and is followed by
-                # collision which makes. As a workaround, we can simply swap f_0 and f_1 in the functional call.
+                # collision which makes f_0 post-collision and f_1 post-streaming.
+                # So as a workaround, we can simply swap f_0 and f_1 in the functional call.
 
                 @wp.func
                 def container_kernel(index: Any):
@@ -94,8 +96,7 @@ def neon_implementation(
         self.force *= self.compute_dtype(0.0)
 
         # Define the neon functionals needed for this operation
-        self.stream_functional = self.stream.neon_functional
-        self.no_slip_bc_functional = self.no_slip_bc_instance.neon_functional
+        self.stepper_functional = self.stepper.neon_functional
 
         grid = bc_mask.get_grid()
         for level in range(grid.num_levels):

From d8442b181fa0376e86da33092f7adfca0baf2afd Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 1 Aug 2025 16:14:29 -0400
Subject: [PATCH 134/208] Added force calculation to this example as well.

---
 examples/cfd/flow_past_sphere_3d.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/examples/cfd/flow_past_sphere_3d.py b/examples/cfd/flow_past_sphere_3d.py
index 6fc399bb..8758b759 100644
--- a/examples/cfd/flow_past_sphere_3d.py
+++ b/examples/cfd/flow_past_sphere_3d.py
@@ -20,7 +20,7 @@
 
 omega = 1.6
 grid_shape = (512 // 2, 128 // 2, 128 // 2)
-compute_backend = ComputeBackend.WARP
+compute_backend = ComputeBackend.JAX
 precision_policy = PrecisionPolicy.FP32FP32
 velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
 u_max = 0.04
@@ -127,14 +127,29 @@ def bc_profile_warp(index: wp.vec3i):
     velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
 )
 
+# Setup Momentum Transfer for Force Calculation
+from xlb.operator.force.momentum_transfer import MomentumTransfer
+
+momentum_transfer = MomentumTransfer(bc_sphere, compute_backend=compute_backend)
+sphere_cross_section = np.pi * sphere_radius**2
+
 
 # Post-Processing Function
-def post_process(step, f_current):
+def post_process(step, f_0, f_1):
+    # Compute lift and drag
+    boundary_force = momentum_transfer(f_0, f_1, bc_mask, missing_mask)
+    drag = boundary_force[0]  # x-direction
+    lift = boundary_force[2]
+    cd = 2.0 * drag / (u_max**2 * sphere_cross_section)
+    cl = 2.0 * lift / (u_max**2 * sphere_cross_section)
+    print(f"CD={cd}, CL={cl}")
+
     # Convert to JAX array if necessary
-    if not isinstance(f_current, jnp.ndarray):
-        f_current = wp.to_jax(f_current)
+    if not isinstance(f_0, jnp.ndarray):
+        f_0 = wp.to_jax(f_0)
+        wp.synchronize()
 
-    rho, u = macro(f_current)
+    rho, u = macro(f_0)
 
     # Remove boundary cells
     u = u[:, 1:-1, 1:-1, 1:-1]
@@ -164,7 +179,7 @@ def post_process(step, f_current):
     if step % post_process_interval == 0 or step == num_steps - 1:
         if compute_backend == ComputeBackend.WARP:
             wp.synchronize()
-        post_process(step, f_0)
+        post_process(step, f_0, f_1)
         end_time = time.time()
         elapsed = end_time - start_time
         print(f"Completed step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")

From bc2826e0f290dfea8275aad71ea555153c4556a6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 1 Aug 2025 17:39:04 -0400
Subject: [PATCH 135/208] Applied the changes to multi-res. This enables
 generic handling of BCs in MRES.

---
 xlb/operator/force/momentum_transfer.py       | 26 +++----
 .../force/multires_momentum_transfer.py       | 75 ++++++++++++++-----
 2 files changed, 69 insertions(+), 32 deletions(-)

diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index dbe58f36..a32e629b 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -12,18 +12,18 @@
 import neon
 
 
-class MomentumTransferLBMStepper(Operator):
+class FetchPopulations(Operator):
     """
-    This operator is used to apply the streaming and boundary conditions to the post-collision distribution function.
-    Note that for dense and single resolution simulations in XLB, f_0 represents the post-collision distribution
-    function. Therefore, we do not need to apply the collision operator here. See the MultiresStepper function in
-    the multires_momentum_transfer for the multi-resolution case.
+    This operator is used to get the post-collision and post-streaming populations.
+    Note that for dense and single-resolution simulations in XLB, the order of operations in the stepper is "stream-then-collide".
+    Therefore, f_0 represents the post-collision values, and the post-streaming values of the current time step need to be reconstructed
+    by applying the streaming and boundary conditions. These populations are readily available in XLB when using multi-resolution
+    grids because the mres stepper relies on "collide-then-stream".
     """
 
     def __init__(
         self,
         no_slip_bc_instance,
-        collision_type=None,
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
@@ -113,8 +113,8 @@ def __init__(
         # Assign the no-slip boundary condition instance
         self.no_slip_bc_instance = no_slip_bc_instance
 
-        # Define the **minimal** stepper operator needed for the momentum transfer
-        self.stepper = MomentumTransferLBMStepper(
+        # Define the population-fetching operator needed for the momentum transfer
+        self.fetcher = FetchPopulations(
             no_slip_bc_instance,
             velocity_set=velocity_set,
             precision_policy=precision_policy,
@@ -156,7 +156,7 @@ def jax_implementation(self, f_0, f_1, bc_mask, missing_mask):
             The force exerted on the solid geometry at each boundary node.
         """
         # Give the input post-collision populations, streaming once and apply the BC the find post-stream values.
-        f_post_collision, f_post_stream = self.stepper(f_0, f_1, bc_mask, missing_mask)
+        f_post_collision, f_post_stream = self.fetcher(f_0, f_1, bc_mask, missing_mask)
 
         # Compute momentum transfer
         boundary = bc_mask == self.no_slip_bc_instance.id
@@ -207,8 +207,8 @@ def functional(
             # If the boundary is an edge then add the momentum transfer
             m = _u_vec()
             if is_edge:
-                # Apply one **minimal** LBM step
-                f_post_collision, f_post_stream = self.stepper_functional(index, f_0, f_1, _missing_mask)
+                # fetch the post-collision and post-streaming populations
+                f_post_collision, f_post_stream = self.fetcher_functional(index, f_0, f_1, _missing_mask)
 
                 # Compute the momentum transfer
                 for d in range(self.velocity_set.d):
@@ -254,7 +254,7 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
         self.force *= 0.0
 
         # Define the warp functionals needed for this operation
-        self.stepper_functional = self.stepper.warp_functional
+        self.fetcher_functional = self.fetcher.warp_functional
 
         # Launch the warp kernel
         wp.launch(
@@ -314,7 +314,7 @@ def neon_implementation(
         self.force *= 0.0
 
         # Define the neon functionals needed for this operation
-        self.stepper_functional = self.stepper.neon_functional
+        self.fetcher_functional = self.fetcher.neon_functional
 
         # Launch the neon container
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, self.force)
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 18e3d0f9..a340f8b4 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -10,6 +10,52 @@
 from xlb.operator.force import MomentumTransfer
 
 
+class MultiresFetchPopulations(Operator):
+    """
+    This operator is used to get the post-collision and post-streaming populations.
+    Note that for dense and single-resolution simulations in XLB, the order of operations in the stepper is "stream-then-collide".
+    Therefore, f_0 represents the post-collision values, and the post-streaming values of the current time step need to be reconstructed
+    by applying the streaming and boundary conditions. These populations are readily available in XLB when using multi-resolution
+    grids because the mres stepper relies on "collide-then-stream".
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        if compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {compute_backend} backend.")
+
+        # Call the parent constructor
+        super().__init__(
+            velocity_set,
+            precision_policy,
+            compute_backend,
+        )
+
+    def _construct_neon(self):
+        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
+
+        @wp.func
+        def functional(
+            index: Any,
+            f_0: Any,
+            f_1: Any,
+            _missing_mask: Any,
+        ):
+            # Get the distribution function
+            f_post_collision = _f_vec()
+            f_post_stream = _f_vec()
+            for l in range(self.velocity_set.q):
+                f_post_stream[l] = self.compute_dtype(self.read_field(f_0, index, l))
+                f_post_collision[l] = self.compute_dtype(self.read_field(f_1, index, l))
+            return f_post_collision, f_post_stream
+
+        return functional, None
+
+
 class MultiresMomentumTransfer(MomentumTransfer):
     """
     Multiresolution Momentum Transfer operator for computing the force on a multiresolution grid.
@@ -19,23 +65,21 @@ class MultiresMomentumTransfer(MomentumTransfer):
     def __init__(
         self,
         no_slip_bc_instance,
-        collision_type="BGK",
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
     ):
+        if compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {compute_backend} backend.")
+
         # Call super
         super().__init__(no_slip_bc_instance, velocity_set, precision_policy, compute_backend)
-        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
-            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
-
-        # TODO! The current implementation does not support encoding and decoding of mesh distance in f_1!
-        assert not self.no_slip_bc_instance.needs_mesh_distance, "Mesh distance is not supported for Force Calculation!"
 
-        # Print a warning to the user about the boundary voxels
-        print(
-            "WARNING! make sure boundary voxels are all at the same level and not among the transition regions from one level to another. "
-            "Otherwise, the results of force calculation are not correct!\n"
+        # Define the population-fetching operator needed for the multi-res momentum transfer
+        self.fetcher = MultiresFetchPopulations(
+            velocity_set=velocity_set,
+            precision_policy=precision_policy,
+            compute_backend=compute_backend,
         )
 
     def _construct_neon(self):
@@ -58,20 +102,13 @@ def container_launcher(loader: neon.Loader):
                 f_0_pn = loader.get_mres_write_handle(f_0)
                 f_1_pn = loader.get_mres_write_handle(f_1)
 
-                # Important: Note the swap to the order of f_0 and f_1 in the functional call.
-                # This is because the multiresolution simulation first performs collision and then streaming and hence
-                # f_0 refers to the post-streaming distribution function and f_1 refers to the post-collision distribution function.
-                # This is in contrast to our dense implementations (all backends) where streaming occurs first and is followed by
-                # collision which makes f_0 post-collision and f_1 post-streaming.
-                # So as a workaround, we can simply swap f_0 and f_1 in the functional call.
-
                 @wp.func
                 def container_kernel(index: Any):
                     # apply the functional
                     functional(
                         index,
-                        f_1_pn,
                         f_0_pn,
+                        f_1_pn,
                         bc_mask_pn,
                         missing_mask_pn,
                         force,
@@ -96,7 +133,7 @@ def neon_implementation(
         self.force *= self.compute_dtype(0.0)
 
         # Define the neon functionals needed for this operation
-        self.stepper_functional = self.stepper.neon_functional
+        self.fetcher_functional = self.fetcher.neon_functional
 
         grid = bc_mask.get_grid()
         for level in range(grid.num_levels):

From 142c06c5f7a0d884f5500c920e06d00493345d66 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 2 Aug 2025 16:36:18 +0200
Subject: [PATCH 136/208] Rename OptimizationType -> MresPerfOptimizationType

---
 xlb/__init__.py                                |  2 +-
 xlb/helper/simulation_manager.py               |  8 ++++----
 ...n_type.py => mres_perf_ptimization_type.py} | 18 +++++++++---------
 3 files changed, 14 insertions(+), 14 deletions(-)
 rename xlb/{optimization_type.py => mres_perf_ptimization_type.py} (80%)

diff --git a/xlb/__init__.py b/xlb/__init__.py
index 7009237e..31c155f5 100644
--- a/xlb/__init__.py
+++ b/xlb/__init__.py
@@ -2,7 +2,7 @@
 from xlb.compute_backend import ComputeBackend as ComputeBackend
 from xlb.precision_policy import PrecisionPolicy as PrecisionPolicy, Precision as Precision
 from xlb.physics_type import PhysicsType as PhysicsType
-from xlb.optimization_type import OptimizationType as OptimizationType
+from xlb.mres_perf_ptimization_type import MresPerfOptimizationType as OptimizationType
 # Config
 from .default_config import init as init, DefaultConfig as DefaultConfig
 
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 7213d14e..647c3773 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -2,7 +2,7 @@
 import warp as wp
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.macroscopic import MultiresMacroscopic
-from xlb.optimization_type import OptimizationType
+from xlb.mres_perf_ptimization_type import MresPerfOptimizationType
 
 
 class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
@@ -19,7 +19,7 @@ def __init__(
             forcing_scheme="exact_difference",
             force_vector=None,
             initializer=None,
-            optimization_type: OptimizationType = OptimizationType.NAIVE_COLLIDE_STREAM,
+            optimization_type: MresPerfOptimizationType = MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
     ):
         super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
@@ -186,9 +186,9 @@ def recursion_fused_finest(level,
                 timestep=0,
             )
 
-        if self.optimization_type == OptimizationType.NAIVE_COLLIDE_STREAM:
+        if self.optimization_type == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             recursion_reference(self.count_levels - 1, app=self.app)
-        elif self.optimization_type == OptimizationType.FUSION_AT_FINEST:
+        elif self.optimization_type == MresPerfOptimizationType.FUSION_AT_FINEST:
             recursion_fused_finest(self.count_levels - 1,
                                    app=self.app,
                                    is_self_f1_the_coalescence_dst_field=None,
diff --git a/xlb/optimization_type.py b/xlb/mres_perf_ptimization_type.py
similarity index 80%
rename from xlb/optimization_type.py
rename to xlb/mres_perf_ptimization_type.py
index 0e3bbe62..fbc733aa 100644
--- a/xlb/optimization_type.py
+++ b/xlb/mres_perf_ptimization_type.py
@@ -2,7 +2,7 @@
 from enum import Enum
 
 
-class OptimizationType(Enum):
+class MresPerfOptimizationType(Enum):
     """
     Enumeration of available optimization strategies for the LBM solver.
 
@@ -13,7 +13,7 @@ class OptimizationType(Enum):
     FUSION_AT_FINEST = 1
 
     @staticmethod
-    def from_string(value: str) -> "OptimizationType":
+    def from_string(value: str) -> "MresPerfOptimizationType":
         """
         Parse a string to an OptimizationType.
 
@@ -30,15 +30,15 @@ def from_string(value: str) -> "OptimizationType":
         """
         # Attempt to parse by name (case-insensitive)
         key = value.strip().upper()
-        if key in OptimizationType.__members__:
-            return OptimizationType[key]
+        if key in MresPerfOptimizationType.__members__:
+            return MresPerfOptimizationType[key]
 
         # Attempt to parse by integer value
         try:
             int_value = int(value)
-            return OptimizationType(int_value)
+            return MresPerfOptimizationType(int_value)
         except (ValueError, KeyError):
-            valid_options = ", ".join(f"{member.name}({member.value})" for member in OptimizationType)
+            valid_options = ", ".join(f"{member.name}({member.value})" for member in MresPerfOptimizationType)
             raise argparse.ArgumentTypeError(
                 f"Invalid OptimizationType {value!r}. Choose from: {valid_options}."
             )
@@ -61,11 +61,11 @@ def build_arg_parser() -> argparse.ArgumentParser:
             description="Run the LBM multiresolution simulation with specified optimizations."
         )
         # Dynamically generate help text from enum members
-        valid_options = ", ".join(f"{member.name}({member.value})" for member in OptimizationType)
+        valid_options = ", ".join(f"{member.name}({member.value})" for member in MresPerfOptimizationType)
         parser.add_argument(
             "-o", "--optimization",
-            type=OptimizationType.from_string,
-            default=OptimizationType.NAIVE_COLLIDE_STREAM,
+            type=MresPerfOptimizationType.from_string,
+            default=MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
             help=f"Select optimization strategy: {valid_options}",
         )
         return parser

From 698e15c1734d2f0acb56a7144157ea19bd3fc825 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 2 Aug 2025 16:45:55 +0200
Subject: [PATCH 137/208] Add comment to multi-res grid.

---
 xlb/grid/multires_grid.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index b5821976..920d2b0c 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -67,6 +67,7 @@ def _initialize_backend(self):
             sparsity_pattern_origins=self.sparsity_pattern_origins,
             stencil=self.neon_stencil,
         )
+        # Print grid stats about voxel distribution between levels.
         self.grid.print_info()
         pass
 

From a3f337289b0e9cb44bb4947d7c636f62c29ae43e Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 2 Aug 2025 16:53:26 +0200
Subject: [PATCH 138/208] Renaming related to mres perf. optimization type
 class.

---
 xlb/__init__.py                  | 2 +-
 xlb/helper/simulation_manager.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/xlb/__init__.py b/xlb/__init__.py
index 31c155f5..7f552066 100644
--- a/xlb/__init__.py
+++ b/xlb/__init__.py
@@ -2,7 +2,7 @@
 from xlb.compute_backend import ComputeBackend as ComputeBackend
 from xlb.precision_policy import PrecisionPolicy as PrecisionPolicy, Precision as Precision
 from xlb.physics_type import PhysicsType as PhysicsType
-from xlb.mres_perf_ptimization_type import MresPerfOptimizationType as OptimizationType
+from xlb.mres_perf_ptimization_type import MresPerfOptimizationType as MresPerfOptimizationType
 # Config
 from .default_config import init as init, DefaultConfig as DefaultConfig
 
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 647c3773..d708360f 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -19,14 +19,14 @@ def __init__(
             forcing_scheme="exact_difference",
             force_vector=None,
             initializer=None,
-            optimization_type: MresPerfOptimizationType = MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
+            mres_perf_opt: MresPerfOptimizationType = MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
     ):
         super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
         self.initializer = initializer
         self.omega = omega
         self.count_levels = grid.count_levels
-        self.optimization_type = optimization_type
+        self.mres_perf_opt = mres_perf_opt
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
@@ -186,9 +186,9 @@ def recursion_fused_finest(level,
                 timestep=0,
             )
 
-        if self.optimization_type == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
+        if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             recursion_reference(self.count_levels - 1, app=self.app)
-        elif self.optimization_type == MresPerfOptimizationType.FUSION_AT_FINEST:
+        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
             recursion_fused_finest(self.count_levels - 1,
                                    app=self.app,
                                    is_self_f1_the_coalescence_dst_field=None,

From a0111a0f8698853bf38466a7fd7ce1ad431788a9 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 2 Aug 2025 16:59:24 +0200
Subject: [PATCH 139/208] Adding nvtx into requirements.

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4d0cd2c4..4eceae10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@ trimesh
 warp-lang
 numpy-stl
 pydantic
-ruff
\ No newline at end of file
+ruff
+nvtx
\ No newline at end of file

From 38d65d41d9c0107320b46b264a94aaecc5f4e8d1 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 4 Aug 2025 22:04:11 +0200
Subject: [PATCH 140/208] Uses only device memory for LBM population fields in
 Neon.

Changes the memory type used for fields f_0 and f_1 to device memory only.
---
 xlb/operator/stepper/nse_multires_stepper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index f3051f5d..7851b667 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -63,11 +63,11 @@ def prepare_fields(self, rho, u, initializer=None):
         """
 
         f_0 = self.grid.create_field(
-            cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision, neon_memory_type=neon.MemoryType.host_device()
+            cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision, neon_memory_type=neon.MemoryType.device()
         )
 
         f_1 = self.grid.create_field(
-            cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision, neon_memory_type=neon.MemoryType.host_device()
+            cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision, neon_memory_type=neon.MemoryType.device()
         )
 
         missing_mask = self.grid.create_field(cardinality=self.velocity_set.q, dtype=Precision.UINT8)

From 9ee4988a553836e518aef287c94fd98d31853998 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 6 Aug 2025 16:50:51 +0200
Subject: [PATCH 141/208] Updating to the latest Neon API.

---
 xlb/operator/stepper/nse_multires_stepper.py | 10 +++++-----
 xlb/utils/mesher.py                          |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 7851b667..a878d618 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -94,7 +94,7 @@ def prepare_fields(self, rho, u, initializer=None):
 
     def prepare_coalescence_count(self, coalescence_factor, bc_mask):
         lattice_central_index = self.velocity_set.center_index
-        num_levels = coalescence_factor.get_grid().get_num_levels()
+        num_levels = coalescence_factor.get_grid().num_levels
 
         @neon.Container.factory(name="sum_kernel_by_level")
         def sum_kernel_by_level(level):
@@ -247,7 +247,7 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
-                for level in range(bc_mask.get_grid().get_num_levels()):
+                for level in range(bc_mask.get_grid().num_levels):
                     # Initialize auxiliary data for each level
                     f_1 = bc.multires_aux_data_init(f_1, bc_mask, missing_mask, level=level, stream=0)
         return f_1
@@ -354,12 +354,12 @@ def collide_coarse(
             omega: Any,
             timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
+            num_levels = f_0_fd.get_grid().num_levels
 
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-                if level + 1 < f_0_fd.get_grid().get_num_levels():
+                if level + 1 < f_0_fd.get_grid().num_levels:
                     f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
                 else:
@@ -430,7 +430,7 @@ def stream_coarse_step_ABC(
             omega: Any,
             timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
+            num_levels = f_0_fd.get_grid().num_levels
 
             # if level != 0:
             #     # throw an exception
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index d857741a..4ee8a5a9 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -444,7 +444,7 @@ def get_fields_data(self, field_neon_dict):
 
         # Ensure that this operator is called on multires grids
         grid_mres = next(iter(field_neon_dict.values())).get_grid()
-        assert grid_mres.get_name() == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
+        assert grid_mres.name== "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
 
         for field_name in field_neon_dict.keys():
             assert field_name in self.field_name_cardinality_dict.keys(), (
@@ -452,7 +452,7 @@ def get_fields_data(self, field_neon_dict):
             )
 
         # number of levels
-        num_levels = grid_mres.get_num_levels()
+        num_levels = grid_mres.num_levels
         assert num_levels == len(self.levels_data), "Error: Inconsistent number of levels!"
 
         # Prepare the fields dictionary to be written by transfering multi-res NEON fields into stacked warp fields and then numpy arrays

From a6cb1115f3ff07b4a98f210beb913d4c5807b0b2 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 7 Aug 2025 22:36:54 -0400
Subject: [PATCH 142/208] fixed ruff formatting

---
 xlb/__init__.py                              |  1 +
 xlb/helper/simulation_manager.py             | 42 ++++++--------
 xlb/mres_perf_ptimization_type.py            | 12 ++--
 xlb/operator/stepper/nse_multires_stepper.py | 60 ++++++++++++--------
 4 files changed, 59 insertions(+), 56 deletions(-)

diff --git a/xlb/__init__.py b/xlb/__init__.py
index 7f552066..e6986507 100644
--- a/xlb/__init__.py
+++ b/xlb/__init__.py
@@ -3,6 +3,7 @@
 from xlb.precision_policy import PrecisionPolicy as PrecisionPolicy, Precision as Precision
 from xlb.physics_type import PhysicsType as PhysicsType
 from xlb.mres_perf_ptimization_type import MresPerfOptimizationType as MresPerfOptimizationType
+
 # Config
 from .default_config import init as init, DefaultConfig as DefaultConfig
 
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index d708360f..d68183a1 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -11,15 +11,15 @@ class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
     """
 
     def __init__(
-            self,
-            omega,
-            grid,
-            boundary_conditions=[],
-            collision_type="BGK",
-            forcing_scheme="exact_difference",
-            force_vector=None,
-            initializer=None,
-            mres_perf_opt: MresPerfOptimizationType = MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
+        self,
+        omega,
+        grid,
+        boundary_conditions=[],
+        collision_type="BGK",
+        forcing_scheme="exact_difference",
+        force_vector=None,
+        initializer=None,
+        mres_perf_opt: MresPerfOptimizationType = MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
     ):
         super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
@@ -30,8 +30,7 @@ def __init__(
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
         self.u = grid.create_field(cardinality=3, dtype=self.precision_policy.store_precision)
-        self.coalescence_factor = grid.create_field(cardinality=self.velocity_set.q,
-                                                    dtype=self.precision_policy.store_precision)
+        self.coalescence_factor = grid.create_field(cardinality=self.velocity_set.q, dtype=self.precision_policy.store_precision)
 
         for level in range(self.count_levels):
             self.u.fill_run(level, 0.0, 0)
@@ -107,10 +106,7 @@ def recursion_reference(level, app):
                 timestep=0,
             )
 
-        def recursion_fused_finest(level,
-                                   app,
-                                   is_self_f1_the_explosion_src_field,
-                                   is_self_f1_the_coalescence_dst_field):
+        def recursion_fused_finest(level, app, is_self_f1_the_explosion_src_field, is_self_f1_the_coalescence_dst_field):
             if level < 0:
                 return
 
@@ -165,13 +161,10 @@ def recursion_fused_finest(level,
             # so is_self_f1_the_explosion_src_field is True
 
             if level - 1 == 0:
-                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=True,
-                                       is_self_f1_the_coalescence_dst_field=True)
+                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=True, is_self_f1_the_coalescence_dst_field=True)
             else:
-                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None,
-                                       is_self_f1_the_coalescence_dst_field=None)
-                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None,
-                                       is_self_f1_the_coalescence_dst_field=None)
+                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None, is_self_f1_the_coalescence_dst_field=None)
+                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None, is_self_f1_the_coalescence_dst_field=None)
             # Important: swapping of f_0 and f_1 is done here
             print(f"RECURSION Level {level}, stream_coarse_step_ABC")
             self.add_to_app(
@@ -189,10 +182,9 @@ def recursion_fused_finest(level,
         if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             recursion_reference(self.count_levels - 1, app=self.app)
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
-            recursion_fused_finest(self.count_levels - 1,
-                                   app=self.app,
-                                   is_self_f1_the_coalescence_dst_field=None,
-                                   is_self_f1_the_explosion_src_field=None)
+            recursion_fused_finest(
+                self.count_levels - 1, app=self.app, is_self_f1_the_coalescence_dst_field=None, is_self_f1_the_explosion_src_field=None
+            )
         else:
             raise ValueError(f"Unknown optimization level: {self.opt_level}")
 
diff --git a/xlb/mres_perf_ptimization_type.py b/xlb/mres_perf_ptimization_type.py
index fbc733aa..069c91a1 100644
--- a/xlb/mres_perf_ptimization_type.py
+++ b/xlb/mres_perf_ptimization_type.py
@@ -9,6 +9,7 @@ class MresPerfOptimizationType(Enum):
     Supports parsing from either the enum member name (case-insensitive)
     or its integer value, and provides a method to build the CLI parser.
     """
+
     NAIVE_COLLIDE_STREAM = 0
     FUSION_AT_FINEST = 1
 
@@ -39,9 +40,7 @@ def from_string(value: str) -> "MresPerfOptimizationType":
             return MresPerfOptimizationType(int_value)
         except (ValueError, KeyError):
             valid_options = ", ".join(f"{member.name}({member.value})" for member in MresPerfOptimizationType)
-            raise argparse.ArgumentTypeError(
-                f"Invalid OptimizationType {value!r}. Choose from: {valid_options}."
-            )
+            raise argparse.ArgumentTypeError(f"Invalid OptimizationType {value!r}. Choose from: {valid_options}.")
 
     def __str__(self) -> str:
         """
@@ -57,13 +56,12 @@ def build_arg_parser() -> argparse.ArgumentParser:
         Returns:
             A configured ArgumentParser instance.
         """
-        parser = argparse.ArgumentParser(
-            description="Run the LBM multiresolution simulation with specified optimizations."
-        )
+        parser = argparse.ArgumentParser(description="Run the LBM multiresolution simulation with specified optimizations.")
         # Dynamically generate help text from enum members
         valid_options = ", ".join(f"{member.name}({member.value})" for member in MresPerfOptimizationType)
         parser.add_argument(
-            "-o", "--optimization",
+            "-o",
+            "--optimization",
             type=MresPerfOptimizationType.from_string,
             default=MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
             help=f"Select optimization strategy: {valid_options}",
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 7851b667..019ade08 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -680,15 +680,15 @@ def cl_stream_coarse(index: Any):
 
         @neon.Container.factory(name="finest_fused_pull")
         def finest_fused_pull(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: Any,
-                is_f1_the_explosion_src_field: bool,
-                is_f1_the_coalescence_dst_field: bool,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: Any,
+            is_f1_the_explosion_src_field: bool,
+            is_f1_the_coalescence_dst_field: bool,
         ):
             if level != 0:
                 # throw an exception
@@ -703,7 +703,7 @@ def finest_fused_pull(
             # module op to define odd of even iteration
             # od_or_even = wp.module("odd_or_even", "even")
 
-            def ll_stream_coarse(loader: neon.Loader):
+            def finest_fused_pull_launcher(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
                 if level + 1 < f_0_fd.get_grid().get_num_levels():
@@ -720,7 +720,7 @@ def ll_stream_coarse(loader: neon.Loader):
                 _w = self.velocity_set.w
 
                 @wp.func
-                def cl_stream_coarse(index: Any):
+                def finest_fused_pull_kernel(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
@@ -744,8 +744,7 @@ def cl_stream_coarse(index: Any):
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
-                                                       has_ngh_at_same_level)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                         # NO finer ngh. in the pull direction (opposite of l)
                         if not has_ngh_at_same_level:
@@ -773,8 +772,7 @@ def cl_stream_coarse(index: Any):
                                     _f_post_stream[l] = exploded_pop
 
                     # do non mres post-streaming corrections
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn,
-                                              _f_post_collision, _f_post_stream, True)
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
@@ -782,8 +780,7 @@ def cl_stream_coarse(index: Any):
 
                     # Apply post-collision boundary conditions
                     _f_post_collision = apply_bc(
-                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream,
-                        _f_post_collision, False
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
                     )
 
                     # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
@@ -801,9 +798,9 @@ def cl_stream_coarse(index: Any):
 
                         wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
 
-                loader.declare_kernel(cl_stream_coarse)
+                loader.declare_kernel(finest_fused_pull_kernel)
 
-            return ll_stream_coarse
+            return finest_fused_pull_launcher
 
         @neon.Container.factory(name="stream_coarse_step_C")
         def stream_coarse_step_C(
@@ -862,22 +859,37 @@ def cl_stream_coarse(index: Any):
             "stream_coarse_step_A": stream_coarse_step_A,
             "stream_coarse_step_B": stream_coarse_step_B,
             "stream_coarse_step_C": stream_coarse_step_C,
-            "finest_fused_pull": finest_fused_pull,  # Placeholder for future use
+            "finest_fused_pull": finest_fused_pull,
         }
 
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
-    def add_to_app(self, app, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep,
+    def add_to_app(
+        self,
+        app,
+        op_name,
+        mres_level,
+        f_0,
+        f_1,
+        bc_mask,
+        missing_mask,
+        omega,
+        timestep,
         is_f1_the_explosion_src_field: bool = None,
-        is_f1_the_coalescence_dst_field    : bool = None):
+        is_f1_the_coalescence_dst_field: bool = None,
+    ):
         nvtx.push_range(f"New Container {op_name}", color="yellow")
         if is_f1_the_explosion_src_field is None:
             app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
         else:
-            app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep,
-                                                    is_f1_the_explosion_src_field, is_f1_the_coalescence_dst_field))
+            app.append(
+                self.neon_container[op_name](
+                    mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep, is_f1_the_explosion_src_field, is_f1_the_coalescence_dst_field
+                )
+            )
         nvtx.pop_range()
+
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)

From ec94ff5458a664aa8efc0e7521fb61f3a270f99a Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 8 Aug 2025 20:15:08 -0400
Subject: [PATCH 143/208] minor refactoring

---
 xlb/__init__.py                               |  2 +-
 xlb/helper/simulation_manager.py              | 20 ++++++-------
 ...type.py => mres_perf_optimization_type.py} |  0
 xlb/operator/stepper/nse_multires_stepper.py  | 28 ++++++++-----------
 4 files changed, 20 insertions(+), 30 deletions(-)
 rename xlb/{mres_perf_ptimization_type.py => mres_perf_optimization_type.py} (100%)

diff --git a/xlb/__init__.py b/xlb/__init__.py
index e6986507..12876108 100644
--- a/xlb/__init__.py
+++ b/xlb/__init__.py
@@ -2,7 +2,7 @@
 from xlb.compute_backend import ComputeBackend as ComputeBackend
 from xlb.precision_policy import PrecisionPolicy as PrecisionPolicy, Precision as Precision
 from xlb.physics_type import PhysicsType as PhysicsType
-from xlb.mres_perf_ptimization_type import MresPerfOptimizationType as MresPerfOptimizationType
+from xlb.mres_perf_optimization_type import MresPerfOptimizationType as MresPerfOptimizationType
 
 # Config
 from .default_config import init as init, DefaultConfig as DefaultConfig
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index d68183a1..a9c994d0 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -2,7 +2,7 @@
 import warp as wp
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.macroscopic import MultiresMacroscopic
-from xlb.mres_perf_ptimization_type import MresPerfOptimizationType
+from xlb.mres_perf_optimization_type import MresPerfOptimizationType
 
 
 class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
@@ -106,7 +106,7 @@ def recursion_reference(level, app):
                 timestep=0,
             )
 
-        def recursion_fused_finest(level, app, is_self_f1_the_explosion_src_field, is_self_f1_the_coalescence_dst_field):
+        def recursion_fused_finest(level, app):
             if level < 0:
                 return
 
@@ -123,8 +123,7 @@ def recursion_fused_finest(level, app, is_self_f1_the_explosion_src_field, is_se
                     missing_mask=self.missing_mask,
                     omega=self.omega,
                     timestep=0,
-                    is_f1_the_explosion_src_field=is_self_f1_the_explosion_src_field,
-                    is_f1_the_coalescence_dst_field=is_self_f1_the_coalescence_dst_field,
+                    is_f1_the_explosion_src_field=True,
                 )
                 self.add_to_app(
                     app=app,
@@ -136,8 +135,7 @@ def recursion_fused_finest(level, app, is_self_f1_the_explosion_src_field, is_se
                     missing_mask=self.missing_mask,
                     omega=self.omega,
                     timestep=0,
-                    is_f1_the_explosion_src_field=not is_self_f1_the_explosion_src_field,
-                    is_f1_the_coalescence_dst_field=not is_self_f1_the_coalescence_dst_field,
+                    is_f1_the_explosion_src_field=False,
                 )
                 return
 
@@ -161,10 +159,10 @@ def recursion_fused_finest(level, app, is_self_f1_the_explosion_src_field, is_se
             # so is_self_f1_the_explosion_src_field is True
 
             if level - 1 == 0:
-                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=True, is_self_f1_the_coalescence_dst_field=True)
+                recursion_fused_finest(level - 1, app)
             else:
-                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None, is_self_f1_the_coalescence_dst_field=None)
-                recursion_fused_finest(level - 1, app, is_self_f1_the_explosion_src_field=None, is_self_f1_the_coalescence_dst_field=None)
+                recursion_fused_finest(level - 1, app)
+                recursion_fused_finest(level - 1, app)
             # Important: swapping of f_0 and f_1 is done here
             print(f"RECURSION Level {level}, stream_coarse_step_ABC")
             self.add_to_app(
@@ -182,9 +180,7 @@ def recursion_fused_finest(level, app, is_self_f1_the_explosion_src_field, is_se
         if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             recursion_reference(self.count_levels - 1, app=self.app)
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
-            recursion_fused_finest(
-                self.count_levels - 1, app=self.app, is_self_f1_the_coalescence_dst_field=None, is_self_f1_the_explosion_src_field=None
-            )
+            recursion_fused_finest(self.count_levels - 1, app=self.app)
         else:
             raise ValueError(f"Unknown optimization level: {self.opt_level}")
 
diff --git a/xlb/mres_perf_ptimization_type.py b/xlb/mres_perf_optimization_type.py
similarity index 100%
rename from xlb/mres_perf_ptimization_type.py
rename to xlb/mres_perf_optimization_type.py
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 019ade08..aef8e0db 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -488,11 +488,11 @@ def cl_stream_coarse(index: Any):
                                 # COULD we have a ngh. at the courser level?
                                 if wp.neon_has_parent(f_0_pn, index):
                                     # YES halo cell on top of us
-                                    has_a_courser_ngh = wp.bool(False)
+                                    has_a_coarser_ngh = wp.bool(False)
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
-                                    if has_a_courser_ngh:
+                                    if has_a_coarser_ngh:
                                         # Full state:
                                         # NO finer ngh. in the pull direction (opposite of l)
                                         # NO ngh. at the same level
@@ -640,11 +640,11 @@ def cl_stream_coarse(index: Any):
                                 # COULD we have a ngh. at the courser level?
                                 if wp.neon_has_parent(f_0_pn, index):
                                     # YES halo cell on top of us
-                                    has_a_courser_ngh = wp.bool(False)
+                                    has_a_coarser_ngh = wp.bool(False)
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
-                                    if has_a_courser_ngh:
+                                    if has_a_coarser_ngh:
                                         # Full state:
                                         # NO finer ngh. in the pull direction (opposite of l)
                                         # NO ngh. at the same level
@@ -688,7 +688,6 @@ def finest_fused_pull(
             omega: Any,
             timestep: Any,
             is_f1_the_explosion_src_field: bool,
-            is_f1_the_coalescence_dst_field: bool,
         ):
             if level != 0:
                 # throw an exception
@@ -752,16 +751,16 @@ def finest_fused_pull_kernel(index: Any):
                             # COULD we have a ngh. at the courser level?
                             if wp.neon_has_parent(f_0_pn, index):
                                 # YES halo cell on top of us
-                                has_a_courser_ngh = wp.bool(False)
+                                has_a_coarser_ngh = wp.bool(False)
                                 if is_f1_the_explosion_src_field:
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
                                 else:
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_courser_ngh
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
-                                if has_a_courser_ngh:
+                                if has_a_coarser_ngh:
                                     # Full state:
                                     # NO finer ngh. in the pull direction (opposite of l)
                                     # NO ngh. at the same level
@@ -877,17 +876,12 @@ def add_to_app(
         omega,
         timestep,
         is_f1_the_explosion_src_field: bool = None,
-        is_f1_the_coalescence_dst_field: bool = None,
     ):
         nvtx.push_range(f"New Container {op_name}", color="yellow")
         if is_f1_the_explosion_src_field is None:
             app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
         else:
-            app.append(
-                self.neon_container[op_name](
-                    mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep, is_f1_the_explosion_src_field, is_f1_the_coalescence_dst_field
-                )
-            )
+            app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep, is_f1_the_explosion_src_field))
         nvtx.pop_range()
 
     @Operator.register_backend(ComputeBackend.NEON)

From 2c39a6736bd82fc33b478bafc4c2779d349c9613 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 9 Aug 2025 17:36:33 +0200
Subject: [PATCH 144/208] WIP: multi-GPU with Neon

---
 examples/performance/mlups_3d.py              | 22 +++----
 .../indices_boundary_masker.py                | 61 ++++++++++++++++---
 xlb/operator/stepper/nse_stepper.py           |  9 +--
 3 files changed, 68 insertions(+), 24 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index de67d7b8..1f020c4f 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -104,17 +104,17 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps):
         precision_policy=precision_policy,
         velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend),
     )
-    # if compute_backend == ComputeBackend.NEON:
-    #
-    #     rho = grid.create_field(cardinality=1, dtype=precision_policy.store_precision)
-    #     u = grid.create_field(cardinality=3, dtype=precision_policy.store_precision)
-    #
-    #     macro(f_0, rho, u)
-    #
-    #     wp.synchronize()
-    #     u.update_host(0)
-    #     wp.synchronize()
-    #     u.export_vti(f"{"mlups"}{num_steps}.vti", "u")
+    if compute_backend == ComputeBackend.NEON:
+
+        rho = grid.create_field(cardinality=1, dtype=precision_policy.store_precision)
+        u = grid.create_field(cardinality=3, dtype=precision_policy.store_precision)
+
+        macro(f_0, rho, u)
+
+        wp.synchronize()
+        u.update_host(0)
+        wp.synchronize()
+        u.export_vti(f"{"mlups"}{num_steps}.vti", "u")
 
     return elapsed_time
 
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index f15ba26b..7d35195f 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -25,10 +25,11 @@ def __init__(
         velocity_set=None,
         precision_policy=None,
         compute_backend=None,
+        grid=None,
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
-
+        self.grid = grid
         if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
             # Define masker helper functions
             self.helper_masker = HelperFunctionsMasker(
@@ -328,10 +329,31 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
         is_interior = is_interior[:total_index]
 
         # Convert to Warp arrays
-        wp_bc_indices = wp.array(indices, dtype=wp.int32)
-        wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
-        wp_is_interior = wp.array(is_interior, dtype=wp.uint8)
-        return wp_bc_indices, wp_id_numbers, wp_is_interior
+        if self.compute_backend == ComputeBackend.NEON:
+            grid = self.grid
+            ndevice = grid.bk.get_num_devices()
+            if ndevice == 1:
+                wp_bc_indices = wp.array(indices, dtype=wp.int32)
+                wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
+                wp_is_interior = wp.array(is_interior, dtype=wp.uint8)
+                return wp_bc_indices, wp_id_numbers, wp_is_interior
+            else:
+                # For multi-device, we need to split the indices across devices
+                wp_bc_indices = []
+                wp_id_numbers = []
+                wp_is_interior = []
+                for i in range(ndevice):
+                    device_name = grid.bk.get_device_name(i)
+                    wp_bc_indices.append(wp.array(indices, dtype=wp.int32, device=device_name))
+                    wp_id_numbers.append(wp.array(id_numbers, dtype=wp.uint8, device=device_name))
+                    wp_is_interior.append(wp.array(is_interior, dtype=wp.uint8, device=device_name))
+                return wp_bc_indices, wp_id_numbers, wp_is_interior
+        else:
+            # Convert to Warp arrays
+            wp_bc_indices = wp.array(indices, dtype=wp.int32)
+            wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
+            wp_is_interior = wp.array(is_interior, dtype=wp.uint8)
+            return wp_bc_indices, wp_id_numbers, wp_is_interior
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
@@ -385,9 +407,9 @@ def _construct_neon(self):
 
         @neon.Container.factory(name="IndicesBoundaryMasker_DomainBounds")
         def container_domain_bounds(
-            wp_bc_indices,
-            wp_id_numbers,
-            wp_is_interior,
+            wp_bc_indices_,
+            wp_id_numbers_,
+            wp_is_interior_,
             bc_mask,
             missing_mask,
             grid_shape,
@@ -396,7 +418,18 @@ def domain_bounds_launcher(loader: neon.Loader):
                 loader.set_grid(bc_mask.get_grid())
                 bc_mask_pn = loader.get_write_handle(bc_mask)
                 missing_mask_pn = loader.get_write_handle(missing_mask)
-
+                grid = bc_mask.get_grid()
+                bk = grid.backend
+                if bk.get_num_devices() == 1:
+                    # If there is only one device, we can use the warp arrays directly
+                    wp_bc_indices = wp_bc_indices_
+                    wp_id_numbers = wp_id_numbers_
+                    wp_is_interior = wp_is_interior_
+                else:
+                    dev_idx = loader.get_device_id()
+                    wp_bc_indices = wp_bc_indices_[dev_idx]
+                    wp_id_numbers = wp_id_numbers_[dev_idx]
+                    wp_is_interior = wp_is_interior_[dev_idx]
                 @wp.func
                 def domain_bounds_kernel(index: Any):
                     # apply the functional
@@ -497,6 +530,11 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
 
         # If there are no interior boundary conditions, skip the rest and retun early
         if not bc_interior:
+            wp.synchronize()
+            bc_mask.update_host(0)
+            wp.synchronize()
+            bc_mask.export_vti("bc_mask.vti", "m")
+            wp.synchronize()
             return bc_mask, missing_mask
 
         # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
@@ -514,4 +552,9 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         )
         container_interior_bc_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
+        wp.synchronize()
+        bc_mask.update_host(0)
+        wp.synchronize()
+        bc_mask.export_vti(f"{"bc_mask"}.vti", "u")
+
         return bc_mask, missing_mask
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index ecf37bbf..263d8553 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -107,8 +107,7 @@ def prepare_fields(self, initializer=None):
 
         return f_0, f_1, bc_mask, missing_mask
 
-    @classmethod
-    def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing_mask):
+    def _process_boundary_conditions(self, boundary_conditions, f_1, bc_mask, missing_mask):
         """Process boundary conditions and update boundary masks."""
 
         # Check for boundary condition overlaps
@@ -119,6 +118,7 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
+            grid=self.grid
         )
 
         # Split boundary conditions by type
@@ -127,6 +127,7 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
 
         # Process indices-based boundary conditions
         if bc_with_indices:
+            grid = self.get_grid()
             bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
 
         # Process mesh-based boundary conditions for 3D
@@ -468,7 +469,7 @@ def container(
             def nse_stepper_ll(loader: neon.Loader):
                 loader.set_grid(bc_mask_fd.get_grid())
 
-                f_0_pn = loader.get_read_handle(f_0_fd)
+                f_0_pn = loader.get_read_handle(f_0_fd, operation=neon.Loader.Operation.stencil)
                 bc_mask_pn = loader.get_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_read_handle(missing_mask_fd)
 
@@ -518,7 +519,7 @@ def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
 
     def prepare_skeleton(self, f_0, f_1, bc_mask, missing_mask, omega):
         grid = f_0.get_grid()
-        bk = grid.get_backend()
+        bk = grid.backend
         self.neon_skeleton = {'odd': {}, 'even': {}}
         self.neon_skeleton['odd']['container'] = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, 0)
         self.neon_skeleton['even']['container'] = self.neon_container(f_1, f_0, bc_mask, missing_mask, omega, 1)

From 45f0e963042a2945c2834aec43c79199af537990 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sat, 9 Aug 2025 12:01:41 -0400
Subject: [PATCH 145/208] Added force calculation to multires even when mesh
 distance is used. Only works with FUSION_AT_FINEST for now.

---
 .../cuboid_flow_past_sphere_3d.py             | 13 +--
 xlb/mres_perf_optimization_type.py            |  6 +-
 xlb/operator/force/momentum_transfer.py       | 44 ++++++++++-
 .../force/multires_momentum_transfer.py       | 79 ++++++-------------
 4 files changed, 76 insertions(+), 66 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 51e1dbe5..355c11e0 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -206,10 +206,10 @@ def bc_profile_warp(index: wp.vec3i):
 # bc_ground = FullwayBounceBackBC(indices=grid.boundary_indices_across_levels(level_data, box_side="front"))
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
-bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
-# bc_sphere = HybridBC(
-#     bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=False
-# )
+# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
+bc_sphere = HybridBC(
+    bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=True
+)
 
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
 
@@ -235,11 +235,12 @@ def bc_profile_warp(index: wp.vec3i):
     boundary_conditions=boundary_conditions,
     collision_type="KBC",
     initializer=initializer,
+    mres_perf_opt=xlb.mres_perf_optimization_type.MresPerfOptimizationType.FUSION_AT_FINEST,
 )
 
 # Setup Momentum Transfer for Force Calculation
-bc_sphre = boundary_conditions[-1]
-momentum_transfer = MultiresMomentumTransfer(bc_sphere, compute_backend=compute_backend)
+bc_sphere = boundary_conditions[-1]
+momentum_transfer = MultiresMomentumTransfer(bc_sphere, mres_perf_opt=sim.mres_perf_opt, compute_backend=compute_backend)
 
 
 def print_lift_drag(sim):
diff --git a/xlb/mres_perf_optimization_type.py b/xlb/mres_perf_optimization_type.py
index 069c91a1..622982ff 100644
--- a/xlb/mres_perf_optimization_type.py
+++ b/xlb/mres_perf_optimization_type.py
@@ -1,5 +1,5 @@
 import argparse
-from enum import Enum
+from enum import Enum, auto
 
 
 class MresPerfOptimizationType(Enum):
@@ -10,8 +10,8 @@ class MresPerfOptimizationType(Enum):
     or its integer value, and provides a method to build the CLI parser.
     """
 
-    NAIVE_COLLIDE_STREAM = 0
-    FUSION_AT_FINEST = 1
+    NAIVE_COLLIDE_STREAM = auto()
+    FUSION_AT_FINEST = auto()
 
     @staticmethod
     def from_string(value: str) -> "MresPerfOptimizationType":
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index a32e629b..1ea80555 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -3,6 +3,7 @@
 from jax import jit, lax
 import warp as wp
 from typing import Any
+from enum import Enum, auto
 
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
@@ -12,6 +13,18 @@
 import neon
 
 
+# Enum used to keep track of LBM operations
+class LBMOperationSequence(Enum):
+    """
+    Note that for dense and single resolution simulations in XLB, the order of operations in the stepper is "stream-then-collide".
+    For MultiRes stepper however the order of operations is always "collide-then-stream" except at the finest level when the FUSION_AT_FINEST optimization is used.
+    In that case the order of operations is "stream-then-collide" ONLY at the finest level.
+    """
+
+    STREAM_THEN_COLLIDE = auto()
+    COLLIDE_THEN_STREAM = auto()
+
+
 class FetchPopulations(Operator):
     """
     This operator is used to get the post-collision and post-streaming populations
@@ -24,12 +37,14 @@ class FetchPopulations(Operator):
     def __init__(
         self,
         no_slip_bc_instance,
+        operation_sequence: LBMOperationSequence = LBMOperationSequence.STREAM_THEN_COLLIDE,
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
     ):
         self.no_slip_bc_instance = no_slip_bc_instance
         self.stream = Stream(velocity_set, precision_policy, compute_backend)
+        self.operation_sequence = operation_sequence
 
         if compute_backend == ComputeBackend.WARP:
             self.stream_functional = self.stream.warp_functional
@@ -58,7 +73,7 @@ def _construct_warp(self):
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
         @wp.func
-        def functional(
+        def functional_stream_then_collide(
             index: Any,
             f_0: Any,
             f_1: Any,
@@ -75,7 +90,27 @@ def functional(
             f_post_stream = self.bc_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
             return f_post_collision, f_post_stream
 
-        return functional, None
+        @wp.func
+        def functional_collide_then_stream(
+            index: Any,
+            f_0: Any,
+            f_1: Any,
+            _missing_mask: Any,
+        ):
+            # Get the distribution function
+            f_post_collision = _f_vec()
+            f_post_stream = _f_vec()
+            for l in range(self.velocity_set.q):
+                f_post_stream[l] = self.compute_dtype(self.read_field(f_0, index, l))
+                f_post_collision[l] = self.compute_dtype(self.read_field(f_1, index, l))
+            return f_post_collision, f_post_stream
+
+        if self.operation_sequence == LBMOperationSequence.STREAM_THEN_COLLIDE:
+            return functional_stream_then_collide, None
+        elif self.operation_sequence == LBMOperationSequence.COLLIDE_THEN_STREAM:
+            return functional_collide_then_stream, None
+        else:
+            raise ValueError(f"Unknown operation sequence: {self.operation_sequence}")
 
     def _construct_neon(self):
         # Use the warp functional for the NEON backend
@@ -106,16 +141,19 @@ class MomentumTransfer(Operator):
     def __init__(
         self,
         no_slip_bc_instance,
+        operation_sequence: LBMOperationSequence = LBMOperationSequence.STREAM_THEN_COLLIDE,
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
     ):
         # Assign the no-slip boundary condition instance
         self.no_slip_bc_instance = no_slip_bc_instance
+        self.operation_sequence = operation_sequence
 
         # Define the needed for the momentum transfer
         self.fetcher = FetchPopulations(
-            no_slip_bc_instance,
+            no_slip_bc_instance=self.no_slip_bc_instance,
+            operation_sequence=self.operation_sequence,
             velocity_set=velocity_set,
             precision_policy=precision_policy,
             compute_backend=compute_backend,
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index a340f8b4..683b6dc1 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -8,52 +8,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
 from xlb.operator.force import MomentumTransfer
-
-
-class MultiresFetchPopulations(Operator):
-    """
-    This operator is used to get the post-collision and post-streaming populations
-    Note that for dense and single resolution simulations in XLB, the order of operations in the stepper is "stream-then-collide".
-    Therefore, f_0 represents the post-collision values and post_streaming values of the current time step need to be reconstructed
-    by applying the streaming and boundary conditions. These populations are readily available in XLB when using multi-resolution
-    grids because the mres stepper relies on "collide-then-stream".
-    """
-
-    def __init__(
-        self,
-        velocity_set: VelocitySet = None,
-        precision_policy: PrecisionPolicy = None,
-        compute_backend: ComputeBackend = None,
-    ):
-        if compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
-            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {compute_backend} backend.")
-
-        # Call the parent constructor
-        super().__init__(
-            velocity_set,
-            precision_policy,
-            compute_backend,
-        )
-
-    def _construct_neon(self):
-        _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-
-        @wp.func
-        def functional(
-            index: Any,
-            f_0: Any,
-            f_1: Any,
-            _missing_mask: Any,
-        ):
-            # Get the distribution function
-            f_post_collision = _f_vec()
-            f_post_stream = _f_vec()
-            for l in range(self.velocity_set.q):
-                f_post_stream[l] = self.compute_dtype(self.read_field(f_0, index, l))
-                f_post_collision[l] = self.compute_dtype(self.read_field(f_1, index, l))
-            return f_post_collision, f_post_stream
-
-        return functional, None
+from xlb.mres_perf_optimization_type import MresPerfOptimizationType
 
 
 class MultiresMomentumTransfer(MomentumTransfer):
@@ -65,23 +20,39 @@ class MultiresMomentumTransfer(MomentumTransfer):
     def __init__(
         self,
         no_slip_bc_instance,
+        mres_perf_opt=MresPerfOptimizationType.NAIVE_COLLIDE_STREAM,
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
     ):
+        from xlb.operator.force.momentum_transfer import LBMOperationSequence
+
         if compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
             raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {compute_backend} backend.")
 
-        # Call super
-        super().__init__(no_slip_bc_instance, velocity_set, precision_policy, compute_backend)
-
-        # Define the **minimal** stepper operator needed for the multi-res momentum transfer
-        self.fetcher = MultiresFetchPopulations(
-            velocity_set=velocity_set,
-            precision_policy=precision_policy,
-            compute_backend=compute_backend,
+        # Set the sequence of operations based on the performance optimization type
+        if mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
+            operation_sequence = LBMOperationSequence.STREAM_THEN_COLLIDE
+        elif mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
+            operation_sequence = LBMOperationSequence.COLLIDE_THEN_STREAM
+        else:
+            raise ValueError(f"Unknown performance optimization type: {mres_perf_opt}")
+
+        # Check if the performance optimization type is compatible with the use of mesh distance
+        if mres_perf_opt != MresPerfOptimizationType.FUSION_AT_FINEST:
+            assert not no_slip_bc_instance.needs_mesh_distance, (
+                "MultiresMomentumTransfer operator does not support mesh distance for performance optimization other than fusion at the finest level."
+            )
+
+        # Print a warning to the user about the boundary voxels
+        print(
+            "WARNING! make sure boundary voxels are all at the same level and not among the transition regions from one level to another. "
+            "Otherwise, the results of force calculation are not correct!\n"
         )
 
+        # Call super
+        super().__init__(no_slip_bc_instance, operation_sequence, velocity_set, precision_policy, compute_backend)
+
     def _construct_neon(self):
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()

From a0fa2cb80a289e0d0b61a0712f70989bcecd2120 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 15 Aug 2025 12:31:52 -0400
Subject: [PATCH 146/208] Fixed large memory usage in
 boundary_indices_across_levels

---
 xlb/grid/multires_grid.py | 69 +++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 17 deletions(-)

diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index 920d2b0c..e8929e66 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -112,28 +112,63 @@ def boundary_indices_across_levels(self, level_data, box_side: str = "front", re
         """
         num_levels = len(level_data)
         bc_indices_list = []
+        d = self.velocity_set.d  # Dimensionality (2 or 3)
+
+        # Define side configurations (adjust if your conventions differ)
+        if d == 3:
+            side_config = {
+                "left": {"dim": 0, "value": 0},
+                "right": {"dim": 0, "value": lambda s: s[0] - 1},
+                "front": {"dim": 1, "value": 0},
+                "back": {"dim": 1, "value": lambda s: s[1] - 1},
+                "bottom": {"dim": 2, "value": 0},
+                "top": {"dim": 2, "value": lambda s: s[2] - 1},
+            }
+        elif d == 2:
+            side_config = {
+                "left": {"dim": 0, "value": 0},
+                "right": {"dim": 0, "value": lambda s: s[0] - 1},
+                "bottom": {"dim": 1, "value": 0},
+                "top": {"dim": 1, "value": lambda s: s[1] - 1},
+            }
+        else:
+            raise ValueError(f"Unsupported dimensionality: {d}")
+
+        if box_side not in side_config:
+            raise ValueError(f"Unsupported box_side: {box_side}")
+
         for level in range(num_levels):
-            # Find active indices at this level
             mask = level_data[level][0]
-            origin = level_data[level][2]
-            active_indices = np.nonzero(mask) + origin[:, None]
+            origin = level_data[level][2]  # Assume np.array of shape (d,)
+            grid_shape = self.level_to_shape(level)  # tuple of length d
 
-            # Get bottom indices of the bounding box at this level
-            grid_shape = self.level_to_shape(level)
-            box = self.bounding_box_indices(shape=grid_shape, remove_edges=remove_edges)
-            bc_indices = np.array([box[box_side][i] for i in range(self.velocity_set.d)])
+            conf = side_config[box_side]
+            dim_idx = conf["dim"]
+            grid_bounds = conf["value"](grid_shape) if callable(conf["value"]) else conf["value"]
 
-            # Convert to flat indices
-            bc_indices = np.ravel_multi_index(bc_indices, grid_shape)
-            active_indices = np.ravel_multi_index(active_indices, grid_shape)
+            # Get local indices of active voxels
+            local_coords = np.nonzero(mask)  # Tuple of d arrays, each of length num_active
+            if not local_coords[0].size:
+                bc_indices_list.append([])
+                continue
 
-            # Find common rows
-            common = np.intersect1d(active_indices, bc_indices)
+            # Compute global coords (list of d arrays)
+            global_coords = [local_coords[i] + origin[i] for i in range(d)]
 
-            # Append common points at this level to a list
-            if common.size == 0:
-                bc_indices_list.append([])
+            # Filter: must match grid_bounds along the dimension associated with the selected box_side
+            cond = global_coords[dim_idx] == grid_bounds
+
+            # If remove_edges, exclude perimeter of the face
+            if remove_edges:
+                for i in range(d):
+                    if i != dim_idx:
+                        cond &= (global_coords[i] > 0) & (global_coords[i] < grid_shape[i] - 1)
+
+            # Collect filtered indices
+            if np.any(cond):
+                active_bc = [gc[cond] for gc in global_coords]
+                bc_indices_list.append([arr.tolist() for arr in active_bc])
             else:
-                active_bc_indices = np.unravel_index(common, grid_shape)
-                bc_indices_list.append([arr.tolist() for arr in active_bc_indices])
+                bc_indices_list.append([])
+
         return bc_indices_list

From af49421de9f915b1dffa1139b8605c22fcfbdac9 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 16 Aug 2025 09:56:24 +0200
Subject: [PATCH 147/208] Refactoring mlups example

---
 examples/performance/mlups_3d.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 1f020c4f..52d74188 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -21,13 +21,20 @@ def parse_arguments():
     parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation")
     parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+    parser.add_argument("gpu_devices",  type=int, nargs="+",  default=None, help="List of the CUDA devices to use (e.g., -gpu_devices 0 1 2). This is only used for Neon backend.")
+    # add a flat to choose between 19 or 27 velocity set
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
+    # add a flat to choose between multi-gpu occ options based on the neon occ: 
+    parser.add_argument("--occ", type=str, default="standard", help="Occupancy for the simulation (standard, extended, twoWayExtended, none) (default: standard)")
+
+
     return parser.parse_args()
 
 
 def setup_simulation(args):
     if args.compute_backend == "jax":
         compute_backend = ComputeBackend.JAX
-    elif args.compute_backend == "warp":
+    elif args.compute_backend == "warp":    
         compute_backend = ComputeBackend.WARP
     elif args.compute_backend == "neon":
         compute_backend = ComputeBackend.NEON
@@ -48,7 +55,10 @@ def setup_simulation(args):
         default_backend=compute_backend,
         default_precision_policy=precision_policy,
     )
-    return compute_backend, precision_policy
+
+
+
+    return compute_backend, precision_policy, device_list
 
 
 def run_simulation(compute_backend, precision_policy, grid_shape, num_steps):
@@ -114,7 +124,7 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps):
         wp.synchronize()
         u.update_host(0)
         wp.synchronize()
-        u.export_vti(f"{"mlups"}{num_steps}.vti", "u")
+        u.export_vti(f"mlups_{num_steps}.vti", "u")
 
     return elapsed_time
 

From e6490b1c4005afdf545a9e319c8dab9366b7a171 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 16 Aug 2025 09:56:56 +0200
Subject: [PATCH 148/208] Refactoring mlups example: move argument validation into parse_arguments

---
 examples/performance/mlups_3d.py | 43 ++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 52d74188..ee012fa1 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -25,13 +25,11 @@ def parse_arguments():
     # add a flat to choose between 19 or 27 velocity set
     parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
     # add a flat to choose between multi-gpu occ options based on the neon occ: 
-    parser.add_argument("--occ", type=str, default="standard", help="Occupancy for the simulation (standard, extended, twoWayExtended, none) (default: standard)")
-
-
-    return parser.parse_args()
-
-
-def setup_simulation(args):
+    parser.add_argument("--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, extended, twoWayExtended, none) (default: standard)")
+    args = parser.parse_args()
+    
+    # Checking the compute backend
+    compute_backend = None
     if args.compute_backend == "jax":
         compute_backend = ComputeBackend.JAX
     elif args.compute_backend == "warp":    
@@ -40,6 +38,19 @@ def setup_simulation(args):
         compute_backend = ComputeBackend.NEON
     else:
         raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.")
+    
+    args.compute_backend = compute_backend 
+    if args.occ not in ["standard", "extended", "twoWayExtended", "none"]:
+        raise ValueError("Invalid occupancy option. Use 'standard', 'extended', 'twoWayExtended', or 'none'.")
+    
+    # Checking OCC
+    occ = neon.SkeletonConfig.OCC.from_string(args.occ)
+    args.occ = occ
+    if args.gpu_devices is None and args.compute_backend == "neon":
+        print("[Warning] No GPU devices specified. Using default device 0.")
+        args.gpu_devices = [0]
+
+    # Checking precision policy
     precision_policy_map = {
         "fp32/fp32": PrecisionPolicy.FP32FP32,
         "fp64/fp64": PrecisionPolicy.FP64FP64,
@@ -49,6 +60,24 @@ def setup_simulation(args):
     precision_policy = precision_policy_map.get(args.precision)
     if precision_policy is None:
         raise ValueError("Invalid precision specified.")
+    args.precision_policy = precision_policy
+
+    # Checking velocity set
+    if args.velocity_set not in ["D3Q19", "D3Q27"]:
+        raise ValueError("Invalid velocity set. Use 'D3Q19' or 'D3Q27'.")
+    
+    if args.velocity_set == "D3Q19":
+        velocity_set = xlb.velocity_set.D3Q19(precision_policy=args.precision_policy, compute_backend=compute_backend)
+    elif args.velocity_set == "D3Q27":
+        velocity_set = xlb.velocity_set.D3Q27(precision_policy=args.precision_policy, compute_backend=compute_backend)
+    args.velocity_set = velocity_set
+
+    return args
+
+
+def setup_simulation(args):
+
+
 
     xlb.init(
         velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend),

From 29a7d8e617c3b37dbf1ff3d4bcec0202e321dd46 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 16 Aug 2025 10:43:03 +0200
Subject: [PATCH 149/208] Refactoring mlups example: rename setup_simulation to init_xlb, forward Neon options

---
 examples/performance/mlups_3d.py | 41 ++++++++++++++++----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index ee012fa1..253f4c5c 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -21,7 +21,7 @@ def parse_arguments():
     parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation")
     parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-    parser.add_argument("gpu_devices",  type=int, nargs="+",  default=None, help="List of the CUDA devices to use (e.g., -gpu_devices 0 1 2). This is only used for Neon backend.")
+    parser.add_argument("--gpu_devices",  type=int, nargs="+",  default=None, help="List of the CUDA devices to use (e.g., --gpu_devices 0 1 2). This is only used for Neon backend.")
     # add a flat to choose between 19 or 27 velocity set
     parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
     # add a flat to choose between multi-gpu occ options based on the neon occ: 
@@ -40,15 +40,17 @@ def parse_arguments():
         raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.")
     
     args.compute_backend = compute_backend 
+
+    # Checking OCC
     if args.occ not in ["standard", "extended", "twoWayExtended", "none"]:
         raise ValueError("Invalid occupancy option. Use 'standard', 'extended', 'twoWayExtended', or 'none'.")
-    
-    # Checking OCC
-    occ = neon.SkeletonConfig.OCC.from_string(args.occ)
-    args.occ = occ
     if args.gpu_devices is None and args.compute_backend == "neon":
         print("[Warning] No GPU devices specified. Using default device 0.")
         args.gpu_devices = [0]
+    if args.compute_backend == "neon":
+        import neon
+        occ = neon.SkeletonConfig.OCC.from_string(args.occ)
+        args.occ = occ
 
     # Checking precision policy
     precision_policy_map = {
@@ -75,23 +77,21 @@ def parse_arguments():
     return args
 
 
-def setup_simulation(args):
-
-
-
+def init_xlb(args):
     xlb.init(
-        velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend),
-        default_backend=compute_backend,
-        default_precision_policy=precision_policy,
+        velocity_set=args.velocity_set,
+        default_backend=args.compute_backend,
+        default_precision_policy=args.precision_policy,
     )
+    options = None
+    if args.compute_backend == ComputeBackend.NEON:
+        neon_options = {'occ': args.occ, 'device_list': args.gpu_devices}
+        options = neon_options
+    return args.compute_backend, args.precision_policy, options
 
 
-
-    return compute_backend, precision_policy, device_list
-
-
-def run_simulation(compute_backend, precision_policy, grid_shape, num_steps):
-    grid = grid_factory(grid_shape)
+def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options):
+    grid = grid_factory(grid_shape, options=options)
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
 
@@ -108,6 +108,7 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps):
         grid=grid,
         boundary_conditions=boundary_conditions,
         collision_type="BGK",
+        options=options,
     )
 
     # Distribute if using JAX
@@ -167,10 +168,10 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
 # -------------------------- Simulation Loop --------------------------
 
 args = parse_arguments()
-compute_backend, precision_policy = setup_simulation(args)
+compute_backend, precision_policy, options = init_xlb(args)
 grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
 
-elapsed_time = run_simulation(compute_backend=compute_backend, precision_policy=precision_policy, grid_shape=grid_shape, num_steps=args.num_steps)
+elapsed_time = run_simulation(compute_backend=compute_backend, precision_policy=precision_policy, grid_shape=grid_shape, num_steps=args.num_steps, options=options)
 
 mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
 

From 9b12cd6f1769f2b1e0a8236248e08a719e32ec27 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 16 Aug 2025 10:54:17 +0200
Subject: [PATCH 150/208] Refactoring mlups example: parse --gpu_devices list, print run configuration

---
 examples/performance/mlups_3d.py | 37 +++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 253f4c5c..1d39061d 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -21,13 +21,23 @@ def parse_arguments():
     parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation")
     parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-    parser.add_argument("--gpu_devices",  type=int, nargs="+",  default=None, help="List of the CUDA devices to use (e.g., --gpu_devices 0 1 2). This is only used for Neon backend.")
+    parser.add_argument("--gpu_devices", type=str, default=None, help="List of the CUDA devices to use (e.g., --gpu_devices=[0,1,2]). This is only used for Neon backend.")
     # add a flat to choose between 19 or 27 velocity set
     parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
     # add a flat to choose between multi-gpu occ options based on the neon occ: 
     parser.add_argument("--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, extended, twoWayExtended, none) (default: standard)")
     args = parser.parse_args()
     
+    # Parse gpu_devices string to list
+    if args.gpu_devices is not None:
+        try:
+            import ast
+            args.gpu_devices = ast.literal_eval(args.gpu_devices)
+            if not isinstance(args.gpu_devices, list):
+                args.gpu_devices = [args.gpu_devices]  # Handle single integer case
+        except (ValueError, SyntaxError):
+            raise ValueError("Invalid gpu_devices format. Use format like [0,1,2] or [0]")
+    
     # Checking the compute backend
     compute_backend = None
     if args.compute_backend == "jax":
@@ -74,8 +84,31 @@ def parse_arguments():
         velocity_set = xlb.velocity_set.D3Q27(precision_policy=args.precision_policy, compute_backend=compute_backend)
     args.velocity_set = velocity_set
 
+    print_args(args)
+
     return args
 
+def print_args(args):
+    # Print simulation configuration
+    print("=" * 60)
+    print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
+    print("=" * 60)
+    print(f"Grid Size:           {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"Total Lattice Points: {args.cube_edge**3:,}")
+    print(f"Time Steps:          {args.num_steps:,}")
+    print(f"Compute Backend:     {args.compute_backend.name}")
+    print(f"Precision Policy:    {args.precision}")
+    print(f"Velocity Set:        {args.velocity_set.__class__.__name__}")
+
+    if args.compute_backend.name == "NEON":
+        print(f"GPU Devices:         {args.gpu_devices}")
+        # Convert the neon OCC enum back to string for display
+        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        print(f"OCC Strategy:        {occ_display}")
+
+    print("=" * 60)
+    print("Starting simulation...")
+    print()
 
 def init_xlb(args):
     xlb.init(
@@ -171,6 +204,8 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
 compute_backend, precision_policy, options = init_xlb(args)
 grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
 
+
+
 elapsed_time = run_simulation(compute_backend=compute_backend, precision_policy=precision_policy, grid_shape=grid_shape, num_steps=args.num_steps, options=options)
 
 mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)

From 05f395e96b93aa4d2baa50cda2cec613fd06c171 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 16 Aug 2025 16:13:30 +0200
Subject: [PATCH 151/208] Add new command-line options for MLUPS simulation and
 enhance reporting features

- Introduced options for generating reports, exporting final velocity fields, measuring scalability, and specifying the number of repetitions in the MLUPS simulation.
- Updated argument parsing and output formatting to reflect new options.
- Enhanced simulation summary and scalability analysis with detailed statistics and performance results.
- Refactored simulation functions to accommodate new features and improve clarity.
---
 examples/performance/mlups_3d.py    | 371 +++++++++++++++++++++++++---
 xlb/default_config.py               |   2 +-
 xlb/grid/grid.py                    |   3 +-
 xlb/grid/neon_grid.py               |  47 +++-
 xlb/operator/stepper/nse_stepper.py |   2 +
 xlb/precision_policy.py             |   8 +-
 6 files changed, 381 insertions(+), 52 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 1d39061d..9ceb3d34 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -26,6 +26,11 @@ def parse_arguments():
     parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
     # add a flat to choose between multi-gpu occ options based on the neon occ: 
     parser.add_argument("--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, extended, twoWayExtended, none) (default: standard)")
+    parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
+    parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)")
+    parser.add_argument("--measure_scalability", action="store_true", help="Measure scalability of the simulation (default: disabled)")
+    parser.add_argument("--repetitions", type=int, default=1, help="Number of repetitions for the simulation (default: 1) to get the average MLUPs and standard deviation")
+
     args = parser.parse_args()
     
     # Parse gpu_devices string to list
@@ -93,18 +98,21 @@ def print_args(args):
     print("=" * 60)
     print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
     print("=" * 60)
-    print(f"Grid Size:           {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"Grid Size:            {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
     print(f"Total Lattice Points: {args.cube_edge**3:,}")
-    print(f"Time Steps:          {args.num_steps:,}")
-    print(f"Compute Backend:     {args.compute_backend.name}")
-    print(f"Precision Policy:    {args.precision}")
-    print(f"Velocity Set:        {args.velocity_set.__class__.__name__}")
+    print(f"Time Steps:           {args.num_steps:,}")
+    print(f"Compute Backend:      {args.compute_backend.name}")
+    print(f"Precision Policy:     {args.precision}")
+    print(f"Velocity Set:         {args.velocity_set.__class__.__name__}")
+    print(f"Generate Report:      {'Yes' if args.report else 'No'}")
+    print(f"Measure Scalability:  {'Yes' if args.measure_scalability else 'No'}")
+    print(f"Repetitions:          {args.repetitions}")
 
     if args.compute_backend.name == "NEON":
-        print(f"GPU Devices:         {args.gpu_devices}")
+        print(f"GPU Devices:          {args.gpu_devices}")
         # Convert the neon OCC enum back to string for display
         occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
-        print(f"OCC Strategy:        {occ_display}")
+        print(f"OCC Strategy:         {occ_display}")
 
     print("=" * 60)
     print("Starting simulation...")
@@ -123,8 +131,8 @@ def init_xlb(args):
     return args.compute_backend, args.precision_policy, options
 
 
-def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options):
-    grid = grid_factory(grid_shape, options=options)
+def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices):
+    grid = grid_factory(grid_shape, backend_config=options)
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
 
@@ -141,7 +149,7 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
         grid=grid,
         boundary_conditions=boundary_conditions,
         collision_type="BGK",
-        options=options,
+        backend_config=options,
     )
 
     # Distribute if using JAX
@@ -158,18 +166,24 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
     if compute_backend == ComputeBackend.NEON:
         stepper.prepare_skeleton(f_0, f_1, bc_mask, missing_mask, omega)
 
+    warmup_iterations = 10
     # Warp-up iterations
-    for i in range(10):
+    for i in range(warmup_iterations):
         f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
         f_0, f_1 = f_1, f_0
     wp.synchronize()
-
-    start_time = time.time()
-    for i in range(num_steps):
-        f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
-        f_0, f_1 = f_1, f_0
-    wp.synchronize()
-    elapsed_time = time.time() - start_time
+    export_num_steps =  warmup_iterations
+
+    elapsed_time_list = []
+    for i in range(repetitions):
+        start_time = time.time()
+        for i in range(num_steps):
+            f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
+            f_0, f_1 = f_1, f_0
+        wp.synchronize()
+        elapsed_time = time.time() - start_time
+        elapsed_time_list.append(elapsed_time)
+        export_num_steps += num_steps
 
     # Define Macroscopic Calculation
     macro = Macroscopic(
@@ -177,19 +191,19 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
         precision_policy=precision_policy,
         velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend),
     )
-    if compute_backend == ComputeBackend.NEON:
 
-        rho = grid.create_field(cardinality=1, dtype=precision_policy.store_precision)
-        u = grid.create_field(cardinality=3, dtype=precision_policy.store_precision)
-
-        macro(f_0, rho, u)
+    if compute_backend == ComputeBackend.NEON:
+        if export_final_velocity:
+            rho = grid.create_field(cardinality=1, dtype=precision_policy.store_precision)
+            u = grid.create_field(cardinality=3, dtype=precision_policy.store_precision)
 
-        wp.synchronize()
-        u.update_host(0)
-        wp.synchronize()
-        u.export_vti(f"mlups_{num_steps}.vti", "u")
+            macro(f_0, rho, u)
+            wp.synchronize()
+            u.update_host(0)
+            wp.synchronize()
+            u.export_vti(f"mlups_3d_size_{grid_shape[0]}_dev_{num_devices}_step_{export_num_steps}.vti", "u")
 
-    return elapsed_time
+    return elapsed_time_list
 
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time):
@@ -198,17 +212,302 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     return mlups
 
 
-# -------------------------- Simulation Loop --------------------------
+def print_summary(args, elapsed_time, mlups):
+    """Print comprehensive simulation summary with parameters and performance results"""
+    total_lattice_points = args.cube_edge ** 3
+    total_lattice_updates = total_lattice_points * args.num_steps
+    lattice_points_per_second = total_lattice_updates / elapsed_time
+    
+    print("\n\n\n" + "=" * 70)
+    print("                    SIMULATION SUMMARY")
+    print("=" * 70)
+    
+    # Simulation Parameters
+    print("SIMULATION PARAMETERS:")
+    print("-" * 25)
+    print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"  Total Lattice Points:   {total_lattice_points:,}")
+    print(f"  Time Steps:             {args.num_steps:,}")
+    print(f"  Total Lattice Updates:  {total_lattice_updates:,}")
+    print(f"  Compute Backend:        {args.compute_backend.name}")
+    print(f"  Precision Policy:       {args.precision}")
+    print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
+    print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
+    
+    if args.compute_backend.name == "NEON":
+        print(f"  GPU Devices:            {args.gpu_devices}")
+        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        print(f"  OCC Strategy:           {occ_display}")
+    
+    print()
+    
+    # Performance Results
+    print("PERFORMANCE RESULTS:")
+    print("-" * 20)
+    print(f"  Time in main loop:      {elapsed_time:.3f} seconds")
+    print(f"  MLUPs:                  {mlups:.2f}")
+    print(f"  Time per LBM step:      {elapsed_time/args.num_steps*1000:.3f} ms")
+    
+    if args.compute_backend.name == "NEON" and len(args.gpu_devices) > 1:
+        mlups_per_gpu = mlups / len(args.gpu_devices)
+        print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f}")
+    
+    print("=" * 70)
 
-args = parse_arguments()
-compute_backend, precision_policy, options = init_xlb(args)
-grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
 
+def print_summary_with_stats(args, stats):
+    """Print comprehensive simulation summary with statistics from multiple repetitions"""
+    total_lattice_points = args.cube_edge ** 3
+    total_lattice_updates = total_lattice_points * args.num_steps
+    
+    mean_mlups = stats['mean_mlups']
+    std_mlups = stats['std_dev_mlups']
+    mean_elapsed_time = stats['mean_elapsed_time']
+    std_elapsed_time = stats['std_dev_elapsed_time']
+    
+    print("\n\n\n" + "=" * 70)
+    print("                    SIMULATION SUMMARY")
+    print("=" * 70)
+    
+    # Simulation Parameters
+    print("SIMULATION PARAMETERS:")
+    print("-" * 25)
+    print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"  Total Lattice Points:   {total_lattice_points:,}")
+    print(f"  Time Steps:             {args.num_steps:,}")
+    print(f"  Total Lattice Updates:  {total_lattice_updates:,}")
+    print(f"  Repetitions:            {args.repetitions}")
+    print(f"  Compute Backend:        {args.compute_backend.name}")
+    print(f"  Precision Policy:       {args.precision}")
+    print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
+    print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
+    
+    if args.compute_backend.name == "NEON":
+        print(f"  GPU Devices:            {args.gpu_devices}")
+        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        print(f"  OCC Strategy:           {occ_display}")
+    
+    print()
+    
+    # Raw Data (if multiple repetitions)
+    if args.repetitions > 1:
+        print("RAW MEASUREMENT DATA:")
+        print("-" * 21)
+        print(f"{'Run':<6} {'Elapsed Time (s)':<18} {'MLUPs':<12} {'Time/Step (ms)':<15}")
+        print("-" * 53)
+        
+        raw_elapsed_times = stats['raw_elapsed_times']
+        raw_mlups = stats['raw_mlups']
+        
+        for i, (elapsed_time, mlups) in enumerate(zip(raw_elapsed_times, raw_mlups)):
+            time_per_step = elapsed_time / args.num_steps * 1000
+            print(f"{i+1:<6} {elapsed_time:<18.3f} {mlups:<12.2f} {time_per_step:<15.3f}")
+        
+        print("-" * 53)
+        print()
+    
+    # Performance Results (Statistical Summary)
+    print("PERFORMANCE RESULTS:")
+    print("-" * 20)
+    if args.repetitions > 1:
+        print(f"  Time in main loop:      {mean_elapsed_time:.3f} ± {std_elapsed_time:.3f} seconds")
+        print(f"  MLUPs:                  {mean_mlups:.2f} ± {std_mlups:.2f}")
+        print(f"  Time per LBM step:      {mean_elapsed_time/args.num_steps*1000:.3f} ± {std_elapsed_time/args.num_steps*1000:.3f} ms")
+    else:
+        print(f"  Time in main loop:      {mean_elapsed_time:.3f} seconds")
+        print(f"  MLUPs:                  {mean_mlups:.2f}")
+        print(f"  Time per LBM step:      {mean_elapsed_time/args.num_steps*1000:.3f} ms")
+    
+    if args.compute_backend.name == "NEON" and len(args.gpu_devices) > 1:
+        mlups_per_gpu = mean_mlups / len(args.gpu_devices)
+        if args.repetitions > 1:
+            mlups_per_gpu_std = std_mlups / len(args.gpu_devices)
+            print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f} ± {mlups_per_gpu_std:.2f}")
+        else:
+            print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f}")
+    
+    print("=" * 70)
 
 
-elapsed_time = run_simulation(compute_backend=compute_backend, precision_policy=precision_policy, grid_shape=grid_shape, num_steps=args.num_steps, options=options)
+def print_scalability_summary(args, stats_list):
+    """Print comprehensive scalability summary with MLUPs statistics for different GPU counts"""
+    total_lattice_points = args.cube_edge ** 3
+    total_lattice_updates = total_lattice_points * args.num_steps
+    
+    print("\n\n\n" + "=" * 95)
+    print("                           SCALABILITY ANALYSIS")
+    print("=" * 95)
+    
+    # Simulation Parameters
+    print("SIMULATION PARAMETERS:")
+    print("-" * 25)
+    print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"  Total Lattice Points:   {total_lattice_points:,}")
+    print(f"  Time Steps:             {args.num_steps:,}")
+    print(f"  Total Lattice Updates:  {total_lattice_updates:,}")
+    print(f"  Repetitions:            {args.repetitions}")
+    print(f"  Compute Backend:        {args.compute_backend.name}")
+    print(f"  Precision Policy:       {args.precision}")
+    print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    
+    if args.compute_backend.name == "NEON":
+        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        print(f"  OCC Strategy:           {occ_display}")
+        print(f"  Available GPU Devices:  {args.gpu_devices}")
+    
+    print()
+    
+    # Extract mean MLUPs for calculations
+    mlups_means = [stats['mean_mlups'] for stats in stats_list]
+    baseline_mlups = mlups_means[0] if mlups_means else 0
+    
+    # Scalability Results
+    print("SCALABILITY RESULTS:")
+    print("-" * 20)
+    print(f"{'GPUs':<6} {'MLUPs (mean±std)':<18} {'Speedup':<10} {'Efficiency':<12} {'MLUPs/GPU':<12}")
+    print("-" * 68)
+    
+    for i, stats in enumerate(stats_list):
+        num_gpus = i + 1
+        mean_mlups = stats['mean_mlups']
+        std_mlups = stats['std_dev_mlups']
+        speedup = mean_mlups / baseline_mlups if baseline_mlups > 0 else 0
+        efficiency = (speedup / num_gpus) if num_gpus > 0 else 0
+        mlups_per_gpu = mean_mlups / num_gpus if num_gpus > 0 else 0
+        
+        # Format MLUPs with standard deviation
+        if args.repetitions > 1:
+            mlups_str = f"{mean_mlups:.2f}±{std_mlups:.2f}"
+        else:
+            mlups_str = f"{mean_mlups:.2f}"
+        
+        print(f"{num_gpus:<6} {mlups_str:<18} {speedup:<10.2f} {efficiency:<11.3f} {mlups_per_gpu:<12.2f}")
+    
+    print("-" * 68)
+    
+    # Summary Statistics
+    if len(stats_list) > 1:
+        max_mlups = max(mlups_means)
+        max_mlups_idx = mlups_means.index(max_mlups)
+        max_speedup = max_mlups / baseline_mlups if baseline_mlups > 0 else 0
+        best_efficiency_idx = 0
+        best_efficiency = 0.0
+        
+        for i, mean_mlups in enumerate(mlups_means):
+            num_gpus = i + 1
+            speedup = mean_mlups / baseline_mlups if baseline_mlups > 0 else 0
+            efficiency = (speedup / num_gpus) if num_gpus > 0 else 0
+            if efficiency > best_efficiency:
+                best_efficiency = efficiency
+                best_efficiency_idx = i
+        
+        print()
+        print("SUMMARY STATISTICS:")
+        print("-" * 19)
+        print(f"  Best Performance:       {max_mlups:.2f} MLUPs ({max_mlups_idx + 1} GPUs)")
+        if args.repetitions > 1:
+            max_std = stats_list[max_mlups_idx]['std_dev_mlups']
+            print(f"  Performance Std Dev:    ±{max_std:.2f} MLUPs")
+        print(f"  Maximum Speedup:        {max_speedup:.2f}x")
+        print(f"  Best Efficiency:        {best_efficiency:.3f} ({best_efficiency_idx + 1} GPUs)")
+        print(f"  Scalability Range:      1-{len(stats_list)} GPUs")
+    
+    print("=" * 95)
+
+
+def report(args, stats):
+    import neon
+    report = neon.Report("LBM MLUPS LDC")
+    report.add_member('velocity_set', args.velocity_set.__class__.__name__)
+    report.add_member('compute_backend', args.compute_backend.name)
+    report.add_member('precision_policy', args.precision)
+    report.add_member('grid_size', args.cube_edge)
+    report.add_member('num_steps', args.num_steps)
+    report.add_member('repetitions', args.repetitions)
+    
+    # Statistical measures
+    report.add_member('mean_elapsed_time', stats['mean_elapsed_time'])
+    report.add_member('mean_mlups', stats['mean_mlups'])
+    report.add_member('std_dev_elapsed_time', stats['std_dev_elapsed_time'])
+    report.add_member('std_dev_mlups', stats['std_dev_mlups'])
+    
+    # Raw data vectors (if multiple repetitions)
+    if args.repetitions > 1:
+        report.add_member_vector('raw_elapsed_times', stats['raw_elapsed_times'])
+        report.add_member_vector('raw_mlups', stats['raw_mlups'])
+    
+    # Legacy fields for backwards compatibility
+    report.add_member('elapsed_time', stats['mean_elapsed_time'])
+    report.add_member('mlups', stats['mean_mlups'])
+    
+    report.add_member('occ', args.occ)
+    report.add_member_vector('gpu_devices', args.gpu_devices)
+    report.add_member('num_devices', len(args.gpu_devices))
+    report.add_member('measure_scalability', args.measure_scalability)
+
+    report_name = 'mlups_3d_'+f'size_{args.cube_edge}'
+    if args.measure_scalability:
+        report_name += f'_dev_{len(args.gpu_devices)}'
+    if args.repetitions > 1:
+        report_name += f'_rep_{args.repetitions}'
+    report.write(report_name, True)
+# -------------------------- Simulation Loop --------------------------
 
-mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
 
-print(f"Simulation completed in {elapsed_time:.2f} seconds")
-print(f"MLUPs: {mlups:.2f}")
+def benchmark(args):
+    compute_backend, precision_policy, options = init_xlb(args)
+    grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
+
+    elapsed_time_list = []
+    mlups_list = []
+    elapsed_time_list = run_simulation(compute_backend=compute_backend, 
+                                precision_policy=precision_policy, 
+                                grid_shape=grid_shape, 
+                                num_steps=args.num_steps, 
+                                options=options,
+                                export_final_velocity=args.export_final_velocity,
+                                repetitions=args.repetitions,
+                                num_devices=len(args.gpu_devices))
+    
+    for elapsed_time in elapsed_time_list:
+        mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
+        mlups_list.append(mlups)
+
+
+    mean_mlups = np.mean(mlups_list)
+    std_dev_mlups = np.std(mlups_list)
+    mean_elapsed_time = np.mean(elapsed_time_list)
+    std_dev_elapsed_time = np.std(elapsed_time_list)
+
+    stats = {'mean_mlups': mean_mlups, 'std_dev_mlups': std_dev_mlups, 'mean_elapsed_time': mean_elapsed_time, 'std_dev_elapsed_time': std_dev_elapsed_time, 'num_devices': len(args.gpu_devices), 'raw_mlups': mlups_list, 'raw_elapsed_times': elapsed_time_list}
+    # Generate report if requested
+    if args.report:
+        report(args, stats)
+        print("Report generated successfully.")
+
+    return stats
+
+def main():
+    args = parse_arguments()
+    if not args.measure_scalability:
+        stats = benchmark(args)
+        # For single run, print_summary expects individual values with additional stats
+        print_summary_with_stats(args, stats)
+        return
+
+    stats_list = []
+    for num_devices in range(1, len(args.gpu_devices) + 1):
+        import copy
+        args_copy = copy.deepcopy(args)
+        args_copy.gpu_devices = args_copy.gpu_devices[:num_devices]
+        stats = benchmark(args_copy)
+        stats_list.append(stats)
+
+    # Print comprehensive scalability analysis
+    print_scalability_summary(args, stats_list)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xlb/default_config.py b/xlb/default_config.py
index fc24eb4d..a57c1477 100644
--- a/xlb/default_config.py
+++ b/xlb/default_config.py
@@ -1,4 +1,3 @@
-import jax
 from xlb.compute_backend import ComputeBackend
 from dataclasses import dataclass
 from xlb.precision_policy import PrecisionPolicy
@@ -48,6 +47,7 @@ def default_backend() -> ComputeBackend:
 
 
 def check_backend_support():
+    import jax
     if jax.devices()[0].platform == "gpu":
         gpus = jax.devices("gpu")
         if len(gpus) > 1:
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index fd831675..2d03a33a 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -11,6 +11,7 @@ def grid_factory(
     shape: Tuple[int, ...],
     compute_backend: ComputeBackend = None,
     velocity_set=None,
+    backend_config=None,
 ):
     compute_backend = compute_backend or DefaultConfig.default_backend
     velocity_set = velocity_set or DefaultConfig.velocity_set
@@ -21,7 +22,7 @@ def grid_factory(
     elif compute_backend == ComputeBackend.NEON:
         from xlb.grid.neon_grid import NeonGrid
 
-        return NeonGrid(shape=shape, velocity_set=velocity_set)
+        return NeonGrid(shape=shape, velocity_set=velocity_set, backend_config=backend_config)
     elif compute_backend == ComputeBackend.JAX:
         from xlb.grid.jax_grid import JaxGrid
 
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index e0851332..b572a4d9 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -3,14 +3,38 @@
 from .grid import Grid
 from xlb.precision_policy import Precision
 from xlb.compute_backend import ComputeBackend
-from typing import Literal
+from typing import Literal, List
 from xlb import DefaultConfig
 
 
 class NeonGrid(Grid):
-    def __init__(self, shape, velocity_set):
+    def __init__(self,
+                 shape,  # bounding box of the domain
+                 velocity_set,  # velocity set for the grid
+                 backend_config = None,
+                 ):
         from .warp_grid import WarpGrid
 
+        if backend_config is None:
+            backend_config = {
+                'device_list': [0],
+                'skeleton_config': neon.SkeletonConfig.none(),
+            }
+
+        # check that the config dictionary has the required keys
+        required_keys = ['device_list']
+        for key in required_keys:
+            if key not in backend_config:
+                raise ValueError(f"backend_config must contain a '{key}' key")
+        
+        #check that the device list is a list of integers
+        if not isinstance(backend_config['device_list'], list):
+            raise ValueError(f"backend_config['device_list'] must be a list of integers")
+        for device in backend_config['device_list']:
+            if not isinstance(device, int):
+                raise ValueError(f"backend_config['device_list'] must be a list of integers")
+
+        self.config = backend_config
         self.bk = None
         self.dim = None
         self.grid = None
@@ -22,11 +46,9 @@ def __init__(self, shape, velocity_set):
     def _get_velocity_set(self):
         return self.velocity_set
 
-    def _initialize_backend(self):
-        # FIXME@max: for now we hardcode the number of devices to 0
-        num_devs = 2
-        dev_idx_list = list(range(num_devs))
-        dev_idx_list = [0,1]
+    def _initialize_backend(self):     
+        dev_idx_list = self.config['device_list']
+        
         if len(self.shape) == 2:
             import py_neon
 
@@ -50,10 +72,10 @@ def _initialize_backend(self):
         pass
 
     def create_field(
-        self,
-        cardinality: int,
-        dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
-        fill_value=None,
+            self,
+            cardinality: int,
+            dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+            fill_value=None,
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
         field = self.grid.new_field(
@@ -68,7 +90,8 @@ def create_field(
         return field
 
     def _create_warp_field(
-        self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None, fill_value=None, ne_field=None
+            self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+            fill_value=None, ne_field=None
     ):
         warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
         if ne_field is None:
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 263d8553..b2f929f8 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -39,8 +39,10 @@ def __init__(
         collision_type="BGK",
         forcing_scheme="exact_difference",
         force_vector=None,
+        backend_config=None,
     ):
         super().__init__(grid, boundary_conditions)
+        self.backend_config = backend_config
 
         # Construct the collision operator
         if collision_type == "BGK":
diff --git a/xlb/precision_policy.py b/xlb/precision_policy.py
index 7d31c8a3..f82745a3 100644
--- a/xlb/precision_policy.py
+++ b/xlb/precision_policy.py
@@ -1,8 +1,7 @@
 # Enum for precision policy
 
 from enum import Enum, auto
-import jax.numpy as jnp
-import warp as wp
+
 
 
 class Precision(Enum):
@@ -14,6 +13,7 @@ class Precision(Enum):
 
     @property
     def wp_dtype(self):
+        import warp as wp
         if self == Precision.FP64:
             return wp.float64
         elif self == Precision.FP32:
@@ -29,6 +29,8 @@ def wp_dtype(self):
 
     @property
     def jax_dtype(self):
+        import jax.numpy as jnp
+
         if self == Precision.FP64:
             return jnp.float64
         elif self == Precision.FP32:
@@ -81,9 +83,11 @@ def store_precision(self):
             raise ValueError("Invalid precision policy")
 
     def cast_to_compute_jax(self, array):
+        import jax.numpy as jnp
         compute_precision = self.compute_precision
         return jnp.array(array, dtype=compute_precision.jax_dtype)
 
     def cast_to_store_jax(self, array):
+        import jax.numpy as jnp
         store_precision = self.store_precision
         return jnp.array(array, dtype=store_precision.jax_dtype)

From 390e060725303c87c00702510f0c7d852d24e679 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Sat, 16 Aug 2025 19:32:44 +0200
Subject: [PATCH 152/208] Update MLUPS argument parsing and enhance OCC
 handling in NSE stepper

- Modified command-line argument parsing for the MLUPS simulation to simplify the OCC (overlapping communication and computation) options to 'standard' and 'none'.
- Updated the NSE stepper to validate and handle the OCC configuration more robustly.
- Improved readability and clarity in the code by restructuring argument checks and adding comments.
---
 examples/performance/mlups_3d.py    | 15 +++++++--------
 xlb/operator/stepper/nse_stepper.py | 22 +++++++++++++++++++---
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 9ceb3d34..12d4f9c8 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -25,7 +25,7 @@ def parse_arguments():
     # add a flat to choose between 19 or 27 velocity set
     parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
     # add a flat to choose between multi-gpu occ options based on the neon occ: 
-    parser.add_argument("--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, extended, twoWayExtended, none) (default: standard)")
+    parser.add_argument("--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, none) (default: standard)")
     parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
     parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)")
     parser.add_argument("--measure_scalability", action="store_true", help="Measure scalability of the simulation (default: disabled)")
@@ -43,7 +43,7 @@ def parse_arguments():
         except (ValueError, SyntaxError):
             raise ValueError("Invalid gpu_devices format. Use format like [0,1,2] or [0]")
     
-    # Checking the compute backend
+    # Checking the compute backend and covert it to the right type
     compute_backend = None
     if args.compute_backend == "jax":
         compute_backend = ComputeBackend.JAX
@@ -53,16 +53,15 @@ def parse_arguments():
         compute_backend = ComputeBackend.NEON
     else:
         raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.")
-    
     args.compute_backend = compute_backend 
 
     # Checking OCC
-    if args.occ not in ["standard", "extended", "twoWayExtended", "none"]:
-        raise ValueError("Invalid occupancy option. Use 'standard', 'extended', 'twoWayExtended', or 'none'.")
-    if args.gpu_devices is None and args.compute_backend == "neon":
+    if args.occ not in ["standard", "none"]:
+        raise ValueError("Invalid occupancy option. Use 'standard', or 'none'.")
+    if args.gpu_devices is None and args.compute_backend == ComputeBackend.NEON:
         print("[Warning] No GPU devices specified. Using default device 0.")
         args.gpu_devices = [0]
-    if args.compute_backend == "neon":
+    if args.compute_backend == ComputeBackend.NEON:
         import neon
         occ = neon.SkeletonConfig.OCC.from_string(args.occ)
         args.occ = occ
@@ -442,7 +441,7 @@ def report(args, stats):
     report.add_member('elapsed_time', stats['mean_elapsed_time'])
     report.add_member('mlups', stats['mean_mlups'])
     
-    report.add_member('occ', args.occ)
+    report.add_member('occ', (args.occ.to_string() ))
     report.add_member_vector('gpu_devices', args.gpu_devices)
     report.add_member('num_devices', len(args.gpu_devices))
     report.add_member('measure_scalability', args.measure_scalability)
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index b2f929f8..9d44cf8f 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -471,7 +471,11 @@ def container(
             def nse_stepper_ll(loader: neon.Loader):
                 loader.set_grid(bc_mask_fd.get_grid())
 
-                f_0_pn = loader.get_read_handle(f_0_fd, operation=neon.Loader.Operation.stencil)
+                f_0_pn = loader.get_read_handle(
+                    f_0_fd, 
+                    operation=neon.Loader.Operation.stencil, 
+                    discretization = neon.Loader.Discretization.lattice,
+                    )
                 bc_mask_pn = loader.get_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_read_handle(missing_mask_fd)
 
@@ -525,11 +529,23 @@ def prepare_skeleton(self, f_0, f_1, bc_mask, missing_mask, omega):
         self.neon_skeleton = {'odd': {}, 'even': {}}
         self.neon_skeleton['odd']['container'] = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, 0)
         self.neon_skeleton['even']['container'] = self.neon_container(f_1, f_0, bc_mask, missing_mask, omega, 1)
-
+        # check if 'occ' is a valid key
+        if 'occ' not in self.backend_config:
+            occ = neon.SkeletonConfig.none()
+        else:
+            occ = self.backend_config['occ']
+            # check that occ is of type neon.SkeletonConfig.OCC
+            if not isinstance(occ, neon.SkeletonConfig.OCC):
+                print(type(occ))
+                raise ValueError("occ must be of type neon.SkeletonConfig.OCC")
+        
         for key in self.neon_skeleton:
             self.neon_skeleton[key]['app'] = [self.neon_skeleton[key]['container']]
             self.neon_skeleton[key]['skeleton'] = neon.Skeleton(backend=bk)
-            self.neon_skeleton[key]['skeleton'].sequence("mres_nse_stepper", self.neon_skeleton[key]['app'])
+            self.neon_skeleton[key]['skeleton'].sequence(
+                name = "mres_nse_stepper", 
+                containers=self.neon_skeleton[key]['app'],
+                occ = occ)
 
         self.sk = [self.neon_skeleton['odd']['skeleton'],
                    self.neon_skeleton['even']['skeleton']]

From 3b3aa0c4e02262a3df5855058fa6590b7c021a4b Mon Sep 17 00:00:00 2001
From: Apolo Vanderberg <apolo.vanderberg@autodesk.com>
Date: Wed, 6 Aug 2025 14:49:50 -0400
Subject: [PATCH 153/208] fix xdmf path when saving in subfolder

---
 xlb/utils/mesher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index d857741a..d290c3c8 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -288,7 +288,7 @@ def save_xdmf(self, h5_filename, xmf_filename, total_cells, num_points, fields={
                 xmf.write(f'''
             <Attribute Name="{field_name}" AttributeType="Scalar" Center="Cell">
                 <DataItem Dimensions="{total_cells}" NumberType="Float" Precision="4" Format="HDF">
-                {h5_filename}:/Fields/{field_name}
+                {hdf5_rel_path}:/Fields/{field_name}
                 </DataItem>
             </Attribute>
             ''')

From 7282f2bb3f67b129dcc066aaad65215d671ebb78 Mon Sep 17 00:00:00 2001
From: Apolo Vanderberg <apolo.vanderberg@autodesk.com>
Date: Tue, 19 Aug 2025 11:57:17 -0400
Subject: [PATCH 154/208] MultiResIO updates

---
 xlb/utils/mesher.py | 229 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 206 insertions(+), 23 deletions(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index d290c3c8..1f333e7e 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -162,6 +162,7 @@ def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.
         self.connectivity = connectivity
         self.level_id_field = level_id_field
         self.total_cells = total_cells
+        self.centroids = np.mean(coordinates[connectivity], axis=1)
 
         # Set the default precision policy if not provided
         from xlb import DefaultConfig
@@ -531,6 +532,8 @@ def to_slice_image(
         grid_res=512,
         cmap=None,
         component=None,
+        show_axes=False,
+        show_colorbar=False,
         **kwargs,
     ):
         """
@@ -585,6 +588,8 @@ def to_slice_image(
             bounds=bounds,
             grid_res=grid_res,
             cmap=cmap,
+            show_axes=show_axes,
+            show_colorbar=show_colorbar,
             **kwargs,
         )
         print(f"\tSlice image for field {field_name} saved as {output_filename}.png")
@@ -599,6 +604,8 @@ def _to_slice_image_single_field(
         bounds,
         grid_res,
         cmap,
+        show_axes,
+        show_colorbar,
         **kwargs,
     ):
         """
@@ -607,8 +614,8 @@ def _to_slice_image_single_field(
         from matplotlib import cm
         import numpy as np
         import matplotlib.pyplot as plt
-        from scipy.interpolate import griddata
-
+        from scipy.spatial import cKDTree    
+        
         # field data are associated with the cells centers
         cell_values = field_data
 
@@ -616,13 +623,9 @@ def _to_slice_image_single_field(
         plane_normal = np.asarray(np.abs(plane_normal))
         n = plane_normal / np.linalg.norm(plane_normal)
 
-        # Compute centroids (K = 8 for hexahedral cells)
-        cell_points = self.coordinates[self.connectivity]  # shape (M, K, 3)
-        centroids = np.mean(cell_points, axis=1)  # (M, 3)
-
         # Compute signed distances of each cell center to the plane
         plane_point *= plane_normal
-        sdf = np.dot(centroids - plane_point, n)
+        sdf = np.dot(self.centroids - plane_point, n)
 
         # Filter: cells with centroid near plane
         mask = np.abs(sdf) <= slice_thickness / 2
@@ -630,7 +633,7 @@ def _to_slice_image_single_field(
             raise ValueError("No cells intersect the plane within thickness.")
 
         # Project centroids to plane
-        centroids_slice = centroids[mask]
+        centroids_slice = self.centroids[mask]
         sdf_slice = sdf[mask]
         proj = centroids_slice - np.outer(sdf_slice, n)
 
@@ -656,23 +659,203 @@ def _to_slice_image_single_field(
         if cmap is None:
             cmap = cm.nipy_spectral
 
-        # Rasterize: scatter cell centers to 2D grid
-        grid_x = np.linspace(local_x[mask_bounds].min(), local_x[mask_bounds].max(), grid_res)
-        grid_y = np.linspace(local_y[mask_bounds].min(), local_y[mask_bounds].max(), grid_res)
+        # Adjust vertical resolution based on bounds
+        bounded_x_min = local_x[mask_bounds].min()
+        bounded_x_max = local_x[mask_bounds].max()
+        bounded_y_min = local_y[mask_bounds].min()
+        bounded_y_max = local_y[mask_bounds].max()
+        width_x = bounded_x_max - bounded_x_min
+        height_y = bounded_y_max - bounded_y_min        
+        aspect_ratio = height_y / width_x        
+        grid_resY = max(1, int(np.round(grid_res*aspect_ratio)))
+        
+        # Create grid
+        grid_x = np.linspace(bounded_x_min, bounded_x_max, grid_res)
+        grid_y = np.linspace(bounded_y_min, bounded_y_max, grid_resY)
         xv, yv = np.meshgrid(grid_x, grid_y, indexing="xy")
+        
+        # Fast KDTree-based interpolation
+        points = np.column_stack((local_x[mask_bounds], local_y[mask_bounds]))
+        tree = cKDTree(points)
+        
+        # Query points
+        query_points = np.column_stack((xv.ravel(), yv.ravel()))
+        
+        # Find k nearest neighbors for smoother interpolation
+        k = min(4, len(points))  # Use 4 neighbors or less if not enough points
+        distances, indices = tree.query(query_points, k=k, workers=-1) #-1 uses all cores
+        
+        # Inverse distance weighting
+        epsilon = 1e-10
+        weights = 1.0 / (distances + epsilon)
+        weights /= weights.sum(axis=1, keepdims=True)
+        
+        # Interpolate values
+        neighbor_values = values[mask_bounds][indices]
+        grid_field = (neighbor_values * weights).sum(axis=1).reshape(grid_resY, grid_res)
+        
+        # Plot
+        if show_colorbar or show_axes:
+            dpi = 300
+            plt.imshow(
+                grid_field,
+                extent=[bounded_x_min, bounded_x_max, bounded_y_min, bounded_y_max],
+                cmap=cmap,
+                origin="lower",
+                aspect="equal",
+                **kwargs,
+            )        
+            if show_colorbar:
+                plt.colorbar()
+            if not show_axes:
+                plt.axis('off')
+            plt.savefig(output_filename + ".png", dpi=dpi, bbox_inches="tight", pad_inches=0)
+            plt.close()
+        else:
+            plt.imsave(output_filename + ".png", grid_field, cmap=cmap, origin="lower")
+
+    def to_line(self, 
+        output_filename, 
+        field_neon_dict, 
+        start_point, 
+        end_point, 
+        resolution, 
+        component=None,
+        radius=1.0,
+        **kwargs,):
+        """
+        Extract field data along a line between start_point and end_point and save to a CSV file.
 
-        # Linear interpolation for each grid point
-        grid_field = griddata(points=(local_x, local_y), values=values, xi=(xv, yv), method="linear", fill_value=np.nan)
+        This function performs two main steps:
+        1. Extracts field data from field_neon_dict, handling components or computing magnitude.
+        2. Interpolates the field values along a line defined by start_point and end_point,
+        then saves the results (coordinates and field values) to a CSV file.
 
-        # Plot
-        plt.imshow(
-            grid_field,
-            extent=[xmin, xmax, ymin, ymax],
-            cmap=cmap,
-            origin="lower",
-            aspect="equal",
+        Parameters
+        ----------
+        output_filename : str
+            The name of the output CSV file (without extension). Example: "velocity_profile".
+        field_neon_dict : dict
+            A dictionary containing the field data to extract, with a single key-value pair.
+            The key is the field name (e.g., "velocity"), and the value is the NEON data object
+            containing the field values. Example: {"velocity": velocity_neon}.
+        start_point : array_like
+            The starting point of the line in 3D space (e.g., [x0, y0, z0]).
+            Units must match the coordinate system used in the class (voxel units if untransformed,
+            or model units if scale/offset are applied).
+        end_point : array_like
+            The ending point of the line in 3D space (e.g., [x1, y1, z1]).
+            Units must match the coordinate system used in the class.
+        resolution : int
+            The number of points along the line where the field will be interpolated.
+            Example: 100 for 100 evenly spaced points.
+        component : int, optional
+            The specific component of the field to extract (e.g., 0 for x-component, 1 for y-component).
+            If None, the magnitude of the field is computed. Default is None.
+        radius : int
+            The specified distance (in units of the coordinate system) to prefilter and query for line plot 
+
+        Returns
+        -------
+        None
+            The function writes the output to a CSV file and prints a confirmation message.
+
+        Notes
+        -----
+        - The output CSV file will contain columns: 'x', 'y', 'z', and the value of the field name (e.g., 'velocity_x' or 'velocity_magnitude').
+        """
+        
+    
+        # Get the fields data from the NEON fields
+        assert len(field_neon_dict.keys()) == 1, "Error: This function is designed to plot a single field at a time."
+        fields_data = self.get_fields_data(field_neon_dict)
+        
+        # Check if the component is within the valid range
+        if component is None:
+            print("\tCreating csv plot of the field magnitude!")
+            cell_data = list(fields_data.values())
+            squared = [comp**2 for comp in cell_data]
+            cell_data = np.sqrt(sum(squared))
+            field_name = list(fields_data.keys())[0].split("_")[0] + "_magnitude"
+            
+        else:
+            assert component < max(self.field_name_cardinality_dict.values()), (
+                f"Error: Component {component} is out of range for the provided fields."
+            )
+            print(f"\tCreating csv plot for component {component} of the input field!")
+            field_name = list(fields_data.keys())[component]
+            cell_data = fields_data[field_name]
+        
+        if "velocity" in field_name.lower():
+            cell_data = cell_data * self.conversion
+    
+        # Plot each field in the dictionary
+        self._to_line_field(
+            f"{output_filename}_{field_name}",
+            cell_data,
+            start_point,
+            end_point,
+            resolution,
+            radius=radius,
             **kwargs,
         )
-        plt.colorbar()
-        plt.savefig(output_filename + ".png", dpi=300, bbox_inches="tight")
-        plt.close()
+        print(f"\tLine Plot for field {field_name} saved as {output_filename}.csv")
+    
+    def _to_line_field(
+        self, 
+        output_filename, 
+        cell_data, 
+        start_point, 
+        end_point, 
+        resolution,
+        radius,
+        **kwargs,
+        ):
+        """
+        Helper function to create a line plot for a single field.
+        """
+        import numpy as np
+        
+        #cell_points = self.coordinates[self.connectivity]  # Shape: (M, K, 3), where M is num cells, K is nodes per cell
+        #centroids = np.mean(cell_points, axis=1)  # Shape: (M, 3)
+        centroids = self.centroids
+        p0 = np.array(start_point, dtype=np.float32)
+        p1 = np.array(end_point, dtype=np.float32)
+        
+        # direction and parameter t for each centroid
+        d = (p1 - p0)
+        L = np.linalg.norm(d)
+        d_unit = d / L
+        v = centroids - p0
+        t = v.dot(d_unit)
+        closest = p0 + np.outer(t, d_unit)
+        perp_dist = np.linalg.norm(centroids-closest, axis=1)
+
+        # optionally mask to [0,L] or a small perp-radius
+        mask = (t >= 0) & (t <= L) & (perp_dist <= radius)
+        t, data = t[mask], cell_data[mask]
+
+        # sort by t
+        idx = np.argsort(t)
+        t_sorted = t[idx]
+        data_sorted = data[idx]
+
+        # target samples
+        t_line = np.linspace(0, L, resolution)
+
+        # 1D linear interpolation
+        vals_line = np.interp(t_line, t_sorted, data_sorted, left=np.nan, right=np.nan)
+
+        # reconstruct (x,y,z)
+        line_xyz = p0[None,:] + t_line[:,None]*d_unit[None,:]
+
+        # vectorized CSV dump
+        out = np.hstack([line_xyz, vals_line[:,None]])
+        np.savetxt(
+            output_filename + '.csv',
+            out,
+            delimiter=',',
+            header='x,y,z,value',
+            comments=''
+        )
+

From 889cd234e79d6c07d697aefb8e33be300402949d Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 19 Aug 2025 15:09:38 -0400
Subject: [PATCH 155/208] fixed ruff formatting; removed velocity conversion in to_line

---
 xlb/utils/mesher.py | 99 +++++++++++++++++++++------------------------
 1 file changed, 45 insertions(+), 54 deletions(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 1f333e7e..668cf28c 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -614,8 +614,8 @@ def _to_slice_image_single_field(
         from matplotlib import cm
         import numpy as np
         import matplotlib.pyplot as plt
-        from scipy.spatial import cKDTree    
-        
+        from scipy.spatial import cKDTree
+
         # field data are associated with the cells centers
         cell_values = field_data
 
@@ -665,35 +665,35 @@ def _to_slice_image_single_field(
         bounded_y_min = local_y[mask_bounds].min()
         bounded_y_max = local_y[mask_bounds].max()
         width_x = bounded_x_max - bounded_x_min
-        height_y = bounded_y_max - bounded_y_min        
-        aspect_ratio = height_y / width_x        
-        grid_resY = max(1, int(np.round(grid_res*aspect_ratio)))
-        
+        height_y = bounded_y_max - bounded_y_min
+        aspect_ratio = height_y / width_x
+        grid_resY = max(1, int(np.round(grid_res * aspect_ratio)))
+
         # Create grid
         grid_x = np.linspace(bounded_x_min, bounded_x_max, grid_res)
         grid_y = np.linspace(bounded_y_min, bounded_y_max, grid_resY)
         xv, yv = np.meshgrid(grid_x, grid_y, indexing="xy")
-        
+
         # Fast KDTree-based interpolation
         points = np.column_stack((local_x[mask_bounds], local_y[mask_bounds]))
         tree = cKDTree(points)
-        
+
         # Query points
         query_points = np.column_stack((xv.ravel(), yv.ravel()))
-        
+
         # Find k nearest neighbors for smoother interpolation
         k = min(4, len(points))  # Use 4 neighbors or less if not enough points
-        distances, indices = tree.query(query_points, k=k, workers=-1) #-1 uses all cores
-        
+        distances, indices = tree.query(query_points, k=k, workers=-1)  # -1 uses all cores
+
         # Inverse distance weighting
         epsilon = 1e-10
         weights = 1.0 / (distances + epsilon)
         weights /= weights.sum(axis=1, keepdims=True)
-        
+
         # Interpolate values
         neighbor_values = values[mask_bounds][indices]
         grid_field = (neighbor_values * weights).sum(axis=1).reshape(grid_resY, grid_res)
-        
+
         # Plot
         if show_colorbar or show_axes:
             dpi = 300
@@ -704,25 +704,27 @@ def _to_slice_image_single_field(
                 origin="lower",
                 aspect="equal",
                 **kwargs,
-            )        
+            )
             if show_colorbar:
                 plt.colorbar()
             if not show_axes:
-                plt.axis('off')
+                plt.axis("off")
             plt.savefig(output_filename + ".png", dpi=dpi, bbox_inches="tight", pad_inches=0)
             plt.close()
         else:
             plt.imsave(output_filename + ".png", grid_field, cmap=cmap, origin="lower")
 
-    def to_line(self, 
-        output_filename, 
-        field_neon_dict, 
-        start_point, 
-        end_point, 
-        resolution, 
+    def to_line(
+        self,
+        output_filename,
+        field_neon_dict,
+        start_point,
+        end_point,
+        resolution,
         component=None,
         radius=1.0,
-        **kwargs,):
+        **kwargs,
+    ):
         """
         Extract field data along a line between start_point and end_point and save to a CSV file.
 
@@ -753,7 +755,7 @@ def to_line(self,
             The specific component of the field to extract (e.g., 0 for x-component, 1 for y-component).
             If None, the magnitude of the field is computed. Default is None.
         radius : int
-            The specified distance (in units of the coordinate system) to prefilter and query for line plot 
+            The specified distance (in units of the coordinate system) to prefilter and query for line plot
 
         Returns
         -------
@@ -764,12 +766,11 @@ def to_line(self,
         -----
         - The output CSV file will contain columns: 'x', 'y', 'z', and the value of the field name (e.g., 'velocity_x' or 'velocity_magnitude').
         """
-        
-    
+
         # Get the fields data from the NEON fields
         assert len(field_neon_dict.keys()) == 1, "Error: This function is designed to plot a single field at a time."
         fields_data = self.get_fields_data(field_neon_dict)
-        
+
         # Check if the component is within the valid range
         if component is None:
             print("\tCreating csv plot of the field magnitude!")
@@ -777,7 +778,7 @@ def to_line(self,
             squared = [comp**2 for comp in cell_data]
             cell_data = np.sqrt(sum(squared))
             field_name = list(fields_data.keys())[0].split("_")[0] + "_magnitude"
-            
+
         else:
             assert component < max(self.field_name_cardinality_dict.values()), (
                 f"Error: Component {component} is out of range for the provided fields."
@@ -785,10 +786,7 @@ def to_line(self,
             print(f"\tCreating csv plot for component {component} of the input field!")
             field_name = list(fields_data.keys())[component]
             cell_data = fields_data[field_name]
-        
-        if "velocity" in field_name.lower():
-            cell_data = cell_data * self.conversion
-    
+
         # Plot each field in the dictionary
         self._to_line_field(
             f"{output_filename}_{field_name}",
@@ -800,36 +798,36 @@ def to_line(self,
             **kwargs,
         )
         print(f"\tLine Plot for field {field_name} saved as {output_filename}.csv")
-    
+
     def _to_line_field(
-        self, 
-        output_filename, 
-        cell_data, 
-        start_point, 
-        end_point, 
+        self,
+        output_filename,
+        cell_data,
+        start_point,
+        end_point,
         resolution,
         radius,
         **kwargs,
-        ):
+    ):
         """
         Helper function to create a line plot for a single field.
         """
         import numpy as np
-        
-        #cell_points = self.coordinates[self.connectivity]  # Shape: (M, K, 3), where M is num cells, K is nodes per cell
-        #centroids = np.mean(cell_points, axis=1)  # Shape: (M, 3)
+
+        # cell_points = self.coordinates[self.connectivity]  # Shape: (M, K, 3), where M is num cells, K is nodes per cell
+        # centroids = np.mean(cell_points, axis=1)  # Shape: (M, 3)
         centroids = self.centroids
         p0 = np.array(start_point, dtype=np.float32)
         p1 = np.array(end_point, dtype=np.float32)
-        
+
         # direction and parameter t for each centroid
-        d = (p1 - p0)
+        d = p1 - p0
         L = np.linalg.norm(d)
         d_unit = d / L
         v = centroids - p0
         t = v.dot(d_unit)
         closest = p0 + np.outer(t, d_unit)
-        perp_dist = np.linalg.norm(centroids-closest, axis=1)
+        perp_dist = np.linalg.norm(centroids - closest, axis=1)
 
         # optionally mask to [0,L] or a small perp-radius
         mask = (t >= 0) & (t <= L) & (perp_dist <= radius)
@@ -847,15 +845,8 @@ def _to_line_field(
         vals_line = np.interp(t_line, t_sorted, data_sorted, left=np.nan, right=np.nan)
 
         # reconstruct (x,y,z)
-        line_xyz = p0[None,:] + t_line[:,None]*d_unit[None,:]
+        line_xyz = p0[None, :] + t_line[:, None] * d_unit[None, :]
 
         # vectorized CSV dump
-        out = np.hstack([line_xyz, vals_line[:,None]])
-        np.savetxt(
-            output_filename + '.csv',
-            out,
-            delimiter=',',
-            header='x,y,z,value',
-            comments=''
-        )
-
+        out = np.hstack([line_xyz, vals_line[:, None]])
+        np.savetxt(output_filename + ".csv", out, delimiter=",", header="x,y,z,value", comments="")

From daec1e34b2c1d61a05a5e00bf0857a320fde2ed2 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 21 Aug 2025 23:08:34 -0400
Subject: [PATCH 156/208] improved the implementation of boundary data aux
 encoding during initialization

---
 xlb/operator/boundary_condition/__init__.py   |   2 +-
 xlb/operator/boundary_condition/bc_zouhe.py   |  15 +-
 .../boundary_condition/boundary_condition.py  | 209 ---------------
 .../boundary_condition/helper_functions_bc.py | 253 +++++++++++++++++-
 xlb/operator/stepper/nse_multires_stepper.py  |  17 +-
 xlb/operator/stepper/nse_stepper.py           |  15 +-
 6 files changed, 290 insertions(+), 221 deletions(-)

diff --git a/xlb/operator/boundary_condition/__init__.py b/xlb/operator/boundary_condition/__init__.py
index e1889563..b3c814b5 100644
--- a/xlb/operator/boundary_condition/__init__.py
+++ b/xlb/operator/boundary_condition/__init__.py
@@ -1,4 +1,4 @@
-from xlb.operator.boundary_condition.helper_functions_bc import HelperFunctionsBC
+from xlb.operator.boundary_condition.helper_functions_bc import HelperFunctionsBC, EncodeInitialAuxiliaryData, MultiresEncodeInitialAuxiliaryData
 from xlb.operator.boundary_condition.boundary_condition import BoundaryCondition
 from xlb.operator.boundary_condition.boundary_condition_registry import BoundaryConditionRegistry
 from xlb.operator.boundary_condition.bc_equilibrium import EquilibriumBC
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 73fc7068..4673521c 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -102,14 +102,17 @@ def __init__(
             self.prescribed_value = prescribed_value
             self.profile = self._create_constant_prescribed_profile()
 
-        # This BC needs auxiliary data initialization before streaming
-        self.needs_aux_init = True
+        if self.compute_backend == ComputeBackend.JAX:
+            self.prescribed_values = self.profile()
+        else:
+            # This BC needs auxiliary data initialization before streaming
+            self.needs_aux_init = True
 
-        # This BC needs auxiliary data recovery after streaming
-        self.needs_aux_recovery = True
+            # This BC needs auxiliary data recovery after streaming
+            self.needs_aux_recovery = True
 
-        # This BC needs one auxiliary data for the density or normal velocity
-        self.num_of_aux_data = 1
+            # This BC needs one auxiliary data for the density or normal velocity
+            self.num_of_aux_data = 1
 
         # This BC needs padding for finding missing directions when imposed on a geometry that is in the domain interior
         self.needs_padding = True
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index d99692c2..a949ef35 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -157,212 +157,3 @@ def kernel(
                 f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         return kernel
-
-    def _construct_aux_data_init_kernel(self, functional):
-        """
-        Constructs the warp kernel for the auxiliary data recovery.
-        """
-        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
-
-        _id = wp.uint8(self.id)
-        _opp_indices = self.velocity_set.opp_indices
-        _num_of_aux_data = self.num_of_aux_data
-
-        # Construct the warp kernel
-        @wp.kernel
-        def aux_data_init_kernel(
-            f_1: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.uint8),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _, _, _boundary_id, _missing_mask = bc_helper.get_bc_thread_data(f_1, f_1, bc_mask, missing_mask, index)
-
-            # Apply the functional
-            if _boundary_id == _id:
-                # prescribed_values is a q-sized vector of type wp.vec
-                prescribed_values = functional(index)
-                # Write the result for all q directions, but only store up to num_of_aux_data
-                # TODO: Somehow raise an error if the number of prescribed values does not match the number of missing directions
-
-                # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
-                f_1[0, index[0], index[1], index[2]] = self.store_dtype(prescribed_values[0])
-                counter = wp.int32(1)
-
-                # The other remaining BC auxiliary data are stored in missing directions of f_1.
-                for l in range(1, self.velocity_set.q):
-                    if _missing_mask[l] == wp.uint8(1) and counter < _num_of_aux_data:
-                        f_1[_opp_indices[l], index[0], index[1], index[2]] = self.store_dtype(prescribed_values[counter])
-                        counter += 1
-
-        return aux_data_init_kernel
-
-    def _construct_aux_data_init_container(self, functional):
-        """
-        Constructs the Neon container for encoding auxilary data recovery.
-        """
-
-        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
-
-        _id = wp.uint8(self.id)
-        _opp_indices = self.velocity_set.opp_indices
-        _num_of_aux_data = self.num_of_aux_data
-
-        # Find velocity index for (0, 0, 0)
-        lattice_central_index = self.velocity_set.center_index
-
-        # Construct the Neon container
-        @neon.Container.factory(name="EncodingAuxData_" + str(self.id))
-        def aux_data_init_container(
-            f_1: Any,
-            bc_mask: Any,
-            missing_mask: Any,
-        ):
-            def aux_data_init_ll(loader: neon.Loader):
-                loader.set_grid(f_1.get_grid())
-
-                f_1_pn = loader.get_write_handle(f_1)
-                bc_mask_pn = loader.get_read_handle(bc_mask)
-                missing_mask_pn = loader.get_read_handle(missing_mask)
-
-                @wp.func
-                def aux_data_init_cl(index: Any):
-                    # read tid data
-                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_bc_thread_data(f_1_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
-
-                    # Apply the functional
-                    if _boundary_id == _id:
-                        # prescribed_values is a q-sized vector of type wp.vec
-                        warp_index = wp.vec3i()
-                        gloabl_index = wp.neon_global_idx(f_1_pn, index)
-                        warp_index[0] = wp.neon_get_x(gloabl_index)
-                        warp_index[1] = wp.neon_get_y(gloabl_index)
-                        warp_index[2] = wp.neon_get_z(gloabl_index)
-                        prescribed_values = functional(warp_index)
-
-                    # Write the result for all q directions, but only store up to num_of_aux_data
-                    counter = wp.int32(0)
-                    for l in range(self.velocity_set.q):
-                        # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
-                        if l == lattice_central_index:
-                            # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
-                            wp.neon_write(f_1_pn, index, l, self.store_dtype(prescribed_values[l]))
-                            counter += 1
-                        elif _missing_mask[l] == wp.uint8(1):
-                            # The other remaining BC auxiliary data are stored in missing directions of f_1.
-                            # Only store up to num_of_aux_data
-                            wp.neon_write(f_1_pn, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
-                            counter += 1
-                        if counter > _num_of_aux_data:
-                            # Only store up to num_of_aux_data
-                            return
-
-                # Declare the kernel in the Neon loader
-                loader.declare_kernel(aux_data_init_cl)
-
-            return aux_data_init_ll
-
-        return aux_data_init_container
-
-    # Initialize auxiliary data for the boundary condition.
-    def aux_data_init(self, f_1, bc_mask, missing_mask):
-        if self.compute_backend == ComputeBackend.WARP:
-            # Launch the warp kernel
-            wp.launch(
-                self._construct_aux_data_init_kernel(self.profile),
-                inputs=[f_1, bc_mask, missing_mask],
-                dim=f_1.shape[1:],
-            )
-        elif self.compute_backend == ComputeBackend.JAX:
-            # We don't use boundary aux encoding/decoding in JAX
-            self.prescribed_values = self.profile()
-        elif self.compute_backend == ComputeBackend.NEON:
-            c = self._construct_aux_data_init_container(self.profile)(f_1, bc_mask, missing_mask)
-            c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
-        self.is_initialized_with_aux_data = True
-        return f_1
-
-    def _construct_multires_aux_data_init_container(self, functional):
-        """
-        Constructs the Neon container for encoding auxilary data recovery.
-        """
-
-        bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=self.compute_backend)
-
-        _id = wp.uint8(self.id)
-        _opp_indices = self.velocity_set.opp_indices
-        _num_of_aux_data = self.num_of_aux_data
-
-        # Find velocity index for (0, 0, 0)
-        lattice_central_index = self.velocity_set.center_index
-
-        # Construct the Neon container
-        @neon.Container.factory(name="MultiresEncodingAuxData_" + str(self.id))
-        def aux_data_init_container(
-            f_1: Any,
-            bc_mask: Any,
-            missing_mask: Any,
-            level: Any,
-        ):
-            def aux_data_init_ll(loader: neon.Loader):
-                loader.set_mres_grid(f_1.get_grid(), level)
-
-                f_1_pn = loader.get_mres_write_handle(f_1)
-                bc_mask_pn = loader.get_mres_read_handle(bc_mask)
-                missing_mask_pn = loader.get_mres_read_handle(missing_mask)
-
-                # Get the refinement factor for the current level
-                refinement = 2**level
-
-                @wp.func
-                def aux_data_init_cl(index: Any):
-                    # read tid data
-                    _, _, _boundary_id, _missing_mask = bc_helper.neon_get_bc_thread_data(f_1_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
-
-                    # Apply the functional
-                    if _boundary_id == _id:
-                        # prescribed_values is a q-sized vector of type wp.vec
-                        warp_index = wp.vec3i()
-                        gloabl_index = wp.neon_global_idx(f_1_pn, index)
-                        warp_index[0] = wp.neon_get_x(gloabl_index) // refinement
-                        warp_index[1] = wp.neon_get_y(gloabl_index) // refinement
-                        warp_index[2] = wp.neon_get_z(gloabl_index) // refinement
-                        prescribed_values = functional(warp_index)
-
-                    # Write the result for all q directions, but only store up to num_of_aux_data
-                    counter = wp.int32(0)
-                    for l in range(self.velocity_set.q):
-                        # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
-                        if l == lattice_central_index:
-                            # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
-                            wp.neon_write(f_1_pn, index, l, self.store_dtype(prescribed_values[l]))
-                            counter += 1
-                        elif _missing_mask[l] == wp.uint8(1):
-                            # The other remaining BC auxiliary data are stored in missing directions of f_1.
-                            # Only store up to num_of_aux_data
-                            wp.neon_write(f_1_pn, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
-                            counter += 1
-                        if counter > _num_of_aux_data:
-                            # Only store up to num_of_aux_data
-                            return
-
-                # Declare the kernel in the Neon loader
-                loader.declare_kernel(aux_data_init_cl)
-
-            return aux_data_init_ll
-
-        return aux_data_init_container
-
-    # Initialize auxiliary data for the boundary condition.
-    def multires_aux_data_init(self, f_1, bc_mask, missing_mask, level, stream):
-        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
-            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
-        if self.compute_backend == ComputeBackend.NEON:
-            c = self._construct_multires_aux_data_init_container(self.profile)(f_1, bc_mask, missing_mask, level)
-            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
-        self.is_initialized_with_aux_data = True
-        return f_1
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index e8741159..f030fcd5 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -1,6 +1,13 @@
+import inspect
+from typing import Any, Callable
+
 import warp as wp
-from typing import Any
+import neon
+
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
 from xlb import DefaultConfig, ComputeBackend
+from xlb.operator.operator import Operator
 from xlb.operator.macroscopic import SecondMoment as MomentumFlux
 from xlb.operator.macroscopic import Macroscopic
 from xlb.operator.equilibrium import QuadraticEquilibrium
@@ -317,3 +324,247 @@ def interpolated_nonequilibrium_bounceback(
         self.interpolated_bounceback = interpolated_bounceback
         self.interpolated_nonequilibrium_bounceback = interpolated_nonequilibrium_bounceback
         self.neon_get_bc_thread_data = neon_get_bc_thread_data
+
+
+class EncodeInitialAuxiliaryData(Operator):
+    """
+    Operator for encoding boundary auxiliary data during initialization.
+    """
+
+    def __init__(
+        self,
+        boundary_id: int,
+        num_of_aux_data: int,
+        user_defined_functional: Callable,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        self.user_defined_functional = user_defined_functional
+        self.boundary_id = wp.uint8(boundary_id)
+        self.num_of_aux_data = num_of_aux_data
+
+        super().__init__(velocity_set, precision_policy, compute_backend)
+
+        # Inspect the signature of the user-defined functional.
+        # We assume the profile function takes only the index as input and is hence time-independent.
+        sig = inspect.signature(user_defined_functional)
+        assert self.compute_backend != ComputeBackend.JAX, "Encoding/decoding of auxiliary data are not required for boundary conditions in JAX"
+        assert len(sig.parameters) == 1, "User-defined functional must take exactly one argument (the index)."
+
+        # Define a HelperFunctionsBC instance
+        self.bc_helper = HelperFunctionsBC(
+            velocity_set=self.velocity_set,
+            precision_policy=self.precision_policy,
+            compute_backend=self.compute_backend,
+        )
+
+        # TODO: Somehow raise an error if the number of prescribed values does not match the number of missing directions
+
+    def _construct_warp(self):
+        """
+        Constructs the warp kernel for the auxiliary data recovery.
+        """
+        # Find velocity index for (0, 0, 0)
+        lattice_central_index = self.velocity_set.center_index
+        _opp_indices = self.velocity_set.opp_indices
+        _id = self.boundary_id
+        _num_of_aux_data = self.num_of_aux_data
+
+        @wp.func
+        def functional(
+            index: Any,
+            _missing_mask: Any,
+            field_storage: Any,
+            prescribed_values: Any,
+        ):
+            # Write the result for all q directions, but only store up to num_of_aux_data
+            counter = wp.int32(0)
+            for l in range(self.velocity_set.q):
+                # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                if l == lattice_central_index:
+                    # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
+                    self.write_field(field_storage, index, l, self.store_dtype(prescribed_values[l]))
+                    counter += 1
+                elif _missing_mask[l] == wp.uint8(1):
+                    # The other remaining BC auxiliary data are stored in missing directions of f_1.
+                    # Only store up to num_of_aux_data
+                    self.write_field(field_storage, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
+                    counter += 1
+                if counter > _num_of_aux_data:
+                    # Only store up to num_of_aux_data
+                    return
+
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(
+            f_1: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.uint8),
+        ):
+            # Get the global index
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            # read tid data
+            _, _, _boundary_id, _missing_mask = self.bc_helper.get_bc_thread_data(f_1, f_1, bc_mask, missing_mask, index)
+
+            # Apply the functional
+            # change this to use central location
+            if _boundary_id == _id:
+                # prescribed_values is a q-sized vector of type wp.vec
+                prescribed_values = self.user_defined_functional(index)
+
+                # call the functional
+                functional(index, _missing_mask, f_1, prescribed_values)
+
+        return functional, kernel
+
+    def _construct_neon(self, functional):
+        """
+        Constructs the Neon container for encoding auxilary data recovery.
+        """
+        # Use the warp functional for the Neon backend
+        functional, _ = self._construct_warp()
+        _id = self.boundary_id
+
+        # Construct the Neon container
+        @neon.Container.factory(name="EncodingAuxData_" + str(_id))
+        def aux_data_init_container(
+            f_1: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+        ):
+            def aux_data_init_ll(loader: neon.Loader):
+                loader.set_grid(f_1.get_grid())
+
+                f_1_pn = loader.get_write_handle(f_1)
+                bc_mask_pn = loader.get_read_handle(bc_mask)
+                missing_mask_pn = loader.get_read_handle(missing_mask)
+
+                @wp.func
+                def aux_data_init_cl(index: Any):
+                    # read tid data
+                    _, _, _boundary_id, _missing_mask = self.bc_helper.neon_get_bc_thread_data(f_1_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
+
+                    # Apply the functional
+                    if _boundary_id == _id:
+                        # prescribed_values is a q-sized vector of type wp.vec
+                        warp_index = wp.vec3i()
+                        gloabl_index = wp.neon_global_idx(f_1_pn, index)
+                        warp_index[0] = wp.neon_get_x(gloabl_index)
+                        warp_index[1] = wp.neon_get_y(gloabl_index)
+                        warp_index[2] = wp.neon_get_z(gloabl_index)
+                        prescribed_values = self.user_defined_functional(warp_index)
+
+                        # Call the functional
+                        functional(index, _missing_mask, f_1_pn, prescribed_values)
+
+                # Declare the kernel in the Neon loader
+                loader.declare_kernel(aux_data_init_cl)
+
+            return aux_data_init_ll
+
+        return aux_data_init_container
+
+    # Initialize auxiliary data for the boundary condition.
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_1, bc_mask, missing_mask):
+        if self.compute_backend == ComputeBackend.WARP:
+            # Launch the warp kernel
+            wp.launch(
+                self.warp_kernel,
+                inputs=[f_1, bc_mask, missing_mask],
+                dim=f_1.shape[1:],
+            )
+
+        elif self.compute_backend == ComputeBackend.NEON:
+            c = self.neon_container(f_1, bc_mask, missing_mask)
+            c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+        return f_1
+
+
+class MultiresEncodeInitialAuxiliaryData(EncodeInitialAuxiliaryData):
+    """
+    Operator for encoding boundary auxiliary data during initialization.
+    """
+
+    def __init__(
+        self,
+        boundary_id: int,
+        num_of_aux_data: int,
+        user_defined_functional: Callable,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        super().__init__(
+            boundary_id=boundary_id,
+            num_of_aux_data=num_of_aux_data,
+            user_defined_functional=user_defined_functional,
+            velocity_set=velocity_set,
+            precision_policy=precision_policy,
+            compute_backend=compute_backend,
+        )
+
+        assert self.compute_backend == ComputeBackend.Neon, f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend."
+
+    def _construct_neon(self, functional):
+        """
+        Constructs the Neon container for encoding auxilary data recovery.
+        """
+
+        # Borrow the functional from the warp implementation
+        functional, _ = self._construct_warp()
+        _id = self.boundary_id
+
+        # Construct the Neon container
+        @neon.Container.factory(name="MultiresEncodingAuxData_" + str(_id))
+        def aux_data_init_container(
+            f_1: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            level: Any,
+        ):
+            def aux_data_init_ll(loader: neon.Loader):
+                loader.set_mres_grid(f_1.get_grid(), level)
+
+                f_1_pn = loader.get_mres_write_handle(f_1)
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask)
+
+                # Get the refinement factor for the current level
+                refinement = 2**level
+
+                @wp.func
+                def aux_data_init_cl(index: Any):
+                    # read tid data
+                    _, _, _boundary_id, _missing_mask = self.bc_helper.neon_get_bc_thread_data(f_1_pn, f_1_pn, bc_mask_pn, missing_mask_pn, index)
+
+                    # Apply the functional
+                    if _boundary_id == _id:
+                        # prescribed_values is a q-sized vector of type wp.vec
+                        warp_index = wp.vec3i()
+                        gloabl_index = wp.neon_global_idx(f_1_pn, index)
+                        warp_index[0] = wp.neon_get_x(gloabl_index) // refinement
+                        warp_index[1] = wp.neon_get_y(gloabl_index) // refinement
+                        warp_index[2] = wp.neon_get_z(gloabl_index) // refinement
+                        prescribed_values = self.user_defined_functional(warp_index)
+
+                        # Call the functional
+                        functional(index, _missing_mask, f_1_pn, prescribed_values)
+
+                # Declare the kernel in the Neon loader
+                loader.declare_kernel(aux_data_init_cl)
+
+            return aux_data_init_ll
+
+        return aux_data_init_container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_1, bc_mask, missing_mask, stream):
+        grid = bc_mask.get_grid()
+        for level in range(grid.num_levels):
+            c = self._construct_multires_aux_data_init_container(self.profile)(f_1, bc_mask, missing_mask, level)
+            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
+        return f_1
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index aef8e0db..d7c559e3 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -18,6 +18,7 @@
 from xlb.operator.collision import ForcedCollision
 from xlb.helper import check_bc_overlaps
 from xlb.operator.boundary_masker import MeshVoxelizationMethod, MultiresMeshMaskerAABB, MultiresIndicesBoundaryMasker
+from xlb.operator.boundary_condition.helper_functions_bc import MultiresEncodeInitialAuxiliaryData
 
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
@@ -247,9 +248,19 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
-                for level in range(bc_mask.get_grid().get_num_levels()):
-                    # Initialize auxiliary data for each level
-                    f_1 = bc.multires_aux_data_init(f_1, bc_mask, missing_mask, level=level, stream=0)
+                # Create the encoder operator for storing the auxiliary data
+                encode_auxiliary_data = MultiresEncodeInitialAuxiliaryData(
+                    bc.id,
+                    bc.num_of_aux_data,
+                    bc.profile,
+                    velocity_set=bc.velocity_set,
+                    precision_policy=bc.precision_policy,
+                    compute_backend=bc.compute_backend,
+                )
+
+                # Store the auxiliary data in f_1
+                f_1 = encode_auxiliary_data(f_1, bc_mask, missing_mask, stream=0)
+                bc.is_initialized_with_aux_data = True
         return f_1
 
     def _construct_neon(self):
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 3adb1d00..56eb5043 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -29,6 +29,7 @@
 )
 from xlb.helper import check_bc_overlaps
 from xlb.helper.nse_fields import create_nse_fields
+from xlb.operator.boundary_condition.helper_functions_bc import EncodeInitialAuxiliaryData
 
 
 class IncompressibleNavierStokesStepper(Stepper):
@@ -168,7 +169,19 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
-                f_1 = bc.aux_data_init(f_1, bc_mask, missing_mask)
+                # Create the encoder operator for storing the auxiliary data
+                encode_auxiliary_data = EncodeInitialAuxiliaryData(
+                    bc.id,
+                    bc.num_of_aux_data,
+                    bc.profile,
+                    velocity_set=bc.velocity_set,
+                    precision_policy=bc.precision_policy,
+                    compute_backend=bc.compute_backend,
+                )
+
+                # Store the auxiliary data in f_1
+                f_1 = encode_auxiliary_data(f_1, bc_mask, missing_mask)
+                bc.is_initialized_with_aux_data = True
         return f_1
 
     @Operator.register_backend(ComputeBackend.JAX)

From 28cd20adc7a05625ff736ec3d1b01a6e06e390e1 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 21 Aug 2025 23:23:20 -0400
Subject: [PATCH 157/208] new aux encoder works with multires

---
 xlb/operator/boundary_condition/helper_functions_bc.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index f030fcd5..71f132f9 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -507,9 +507,9 @@ def __init__(
             compute_backend=compute_backend,
         )
 
-        assert self.compute_backend == ComputeBackend.Neon, f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend."
+        assert self.compute_backend == ComputeBackend.NEON, f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend."
 
-    def _construct_neon(self, functional):
+    def _construct_neon(self):
         """
         Constructs the Neon container for encoding auxilary data recovery.
         """
@@ -559,12 +559,12 @@ def aux_data_init_cl(index: Any):
 
             return aux_data_init_ll
 
-        return aux_data_init_container
+        return functional, aux_data_init_container
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_1, bc_mask, missing_mask, stream):
         grid = bc_mask.get_grid()
         for level in range(grid.num_levels):
-            c = self._construct_multires_aux_data_init_container(self.profile)(f_1, bc_mask, missing_mask, level)
+            c = self.neon_container(f_1, bc_mask, missing_mask, level)
             c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
         return f_1

From a540615005b8aa7f8d4c6399fa6e701566869db0 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 21 Aug 2025 23:37:27 -0400
Subject: [PATCH 158/208] added neon_index_to_warp to the bc helper

---
 .../boundary_condition/helper_functions_bc.py | 32 ++++++++++++-------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 71f132f9..ef5c85e2 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -314,6 +314,22 @@ def interpolated_nonequilibrium_bounceback(
 
             return f_post
 
+        @wp.func
+        def neon_index_to_warp(neon_field_hdl: Any, index: Any):
+            # Unpack the global index in Neon
+            cIdx = wp.neon_global_idx(neon_field_hdl, index)
+            gx = wp.neon_get_x(cIdx)
+            gy = wp.neon_get_y(cIdx)
+            gz = wp.neon_get_z(cIdx)
+
+            # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+            if _d == 2:
+                gy, gz = gz, gy
+
+            # Get warp indices
+            index_wp = wp.vec3i(gx, gy, gz)
+            return index_wp
+
         self.get_bc_thread_data = get_bc_thread_data
         self.get_bc_fsum = get_bc_fsum
         self.get_normal_vectors = get_normal_vectors
@@ -324,6 +340,7 @@ def interpolated_nonequilibrium_bounceback(
         self.interpolated_bounceback = interpolated_bounceback
         self.interpolated_nonequilibrium_bounceback = interpolated_nonequilibrium_bounceback
         self.neon_get_bc_thread_data = neon_get_bc_thread_data
+        self.neon_index_to_warp = neon_index_to_warp
 
 
 class EncodeInitialAuxiliaryData(Operator):
@@ -449,12 +466,7 @@ def aux_data_init_cl(index: Any):
 
                     # Apply the functional
                     if _boundary_id == _id:
-                        # prescribed_values is a q-sized vector of type wp.vec
-                        warp_index = wp.vec3i()
-                        gloabl_index = wp.neon_global_idx(f_1_pn, index)
-                        warp_index[0] = wp.neon_get_x(gloabl_index)
-                        warp_index[1] = wp.neon_get_y(gloabl_index)
-                        warp_index[2] = wp.neon_get_z(gloabl_index)
+                        warp_index = self.bc_helper.neon_index_to_warp(f_1_pn, index)
                         prescribed_values = self.user_defined_functional(warp_index)
 
                         # Call the functional
@@ -543,12 +555,8 @@ def aux_data_init_cl(index: Any):
 
                     # Apply the functional
                     if _boundary_id == _id:
-                        # prescribed_values is a q-sized vector of type wp.vec
-                        warp_index = wp.vec3i()
-                        gloabl_index = wp.neon_global_idx(f_1_pn, index)
-                        warp_index[0] = wp.neon_get_x(gloabl_index) // refinement
-                        warp_index[1] = wp.neon_get_y(gloabl_index) // refinement
-                        warp_index[2] = wp.neon_get_z(gloabl_index) // refinement
+                        warp_index = self.bc_helper.neon_index_to_warp(f_1_pn, index)
+                        warp_index /= refinement
                         prescribed_values = self.user_defined_functional(warp_index)
 
                         # Call the functional

From fc094dcc5d90e98300d91dfb4ad4449a90fefdb6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 22 Aug 2025 10:12:58 -0400
Subject: [PATCH 159/208] fixed a bug: split the NEON branch of warp_implementation into neon_implementation

---
 .../boundary_condition/helper_functions_bc.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index ef5c85e2..ba8e4672 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -479,20 +479,20 @@ def aux_data_init_cl(index: Any):
 
         return aux_data_init_container
 
-    # Initialize auxiliary data for the boundary condition.
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, f_1, bc_mask, missing_mask):
-        if self.compute_backend == ComputeBackend.WARP:
-            # Launch the warp kernel
-            wp.launch(
-                self.warp_kernel,
-                inputs=[f_1, bc_mask, missing_mask],
-                dim=f_1.shape[1:],
-            )
-
-        elif self.compute_backend == ComputeBackend.NEON:
-            c = self.neon_container(f_1, bc_mask, missing_mask)
-            c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_1, bc_mask, missing_mask],
+            dim=f_1.shape[1:],
+        )
+        return f_1
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_1, bc_mask, missing_mask):
+        c = self.neon_container(f_1, bc_mask, missing_mask)
+        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
         return f_1
 
 

From 8556361943ef536725aabc59c43af02b3e7d2a1f Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 22 Aug 2025 14:19:11 -0400
Subject: [PATCH 160/208] Moved ZouHe/Regularized decoding function to the new
 EncodeAuxiliaryData helper class.

---
 xlb/operator/boundary_condition/bc_zouhe.py   | 23 +++----
 .../boundary_condition/helper_functions_bc.py | 65 +++++++++++++++----
 xlb/operator/stepper/nse_multires_stepper.py  |  9 ++-
 xlb/operator/stepper/nse_stepper.py           |  9 ++-
 4 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 4673521c..6883afeb 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -114,6 +114,9 @@ def __init__(
             # This BC needs one auxiliary data for the density or normal velocity
             self.num_of_aux_data = 1
 
+            # A placeholder for encoder-decoder object
+            self.encode_auxiliary_data = None
+
         # This BC needs padding for finding missing directions when imposed on a geometry that is in the domain interior
         self.needs_padding = True
 
@@ -279,9 +282,11 @@ def _construct_warp(self):
         # load helper functions. Always use warp backend for helper functions as it may also be called by the Neon backend.
         bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=ComputeBackend.WARP)
 
+        # get decoder functional
+        decoder_functional = self.encode_auxiliary_data.warp_functional["decode"]
+
         # Set local constants
         _d = self.velocity_set.d
-        lattice_central_index = self.velocity_set.center_index
 
         @wp.func
         def functional_velocity(
@@ -306,7 +311,7 @@ def functional_velocity(
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = decode_lattice_center_value(index, f_1)
+            prescribed_value = decoder_functional(index, _missing_mask, f_1)
             _u = -prescribed_value * normals
 
             for d in range(_d):
@@ -337,7 +342,7 @@ def functional_pressure(
 
             # Find the value of rho from the missing directions
             # Since we need only one scalar value, we only need to find one value (stored at the center of f_1)
-            _rho = decode_lattice_center_value(index, f_1)
+            _rho = decoder_functional(index, _missing_mask, f_1)
 
             # calculate velocity
             fsum = bc_helper.get_bc_fsum(_f, _missing_mask)
@@ -349,18 +354,6 @@ def functional_pressure(
             _f = bc_helper.bounceback_nonequilibrium(_f, feq, _missing_mask)
             return _f
 
-        @wp.func
-        def decode_lattice_center_value(index: Any, f_1: Any):
-            """
-            Decode the encoded values needed for the boundary condition treatment from the center location in f_1.
-            """
-            if wp.static(self.compute_backend == ComputeBackend.WARP):
-                value = f_1[lattice_central_index, index[0], index[1], index[2]]
-            else:
-                # Note: in Neon case, f_1 is a pointer to the field not the actual data.
-                value = wp.neon_read(f_1, index, lattice_central_index)
-            return self.compute_dtype(value)
-
         if self.bc_type == "velocity":
             functional = functional_velocity
         elif self.bc_type == "pressure":
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index ba8e4672..ddd73ee5 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -343,7 +343,7 @@ def neon_index_to_warp(neon_field_hdl: Any, index: Any):
         self.neon_index_to_warp = neon_index_to_warp
 
 
-class EncodeInitialAuxiliaryData(Operator):
+class EncodeAuxiliaryData(Operator):
     """
     Operator for encoding boundary auxiliary data during initialization.
     """
@@ -389,12 +389,16 @@ def _construct_warp(self):
         _num_of_aux_data = self.num_of_aux_data
 
         @wp.func
-        def functional(
+        def encoder_functional(
             index: Any,
             _missing_mask: Any,
             field_storage: Any,
             prescribed_values: Any,
         ):
+            if len(prescribed_values) != _num_of_aux_data:
+                wp.printf("Error: User-defined profile must return a vector of size %d\n", _num_of_aux_data)
+                return
+
             # Write the result for all q directions, but only store up to num_of_aux_data
             counter = wp.int32(0)
             for l in range(self.velocity_set.q):
@@ -403,7 +407,7 @@ def functional(
                     # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
                     self.write_field(field_storage, index, l, self.store_dtype(prescribed_values[l]))
                     counter += 1
-                elif _missing_mask[l] == wp.uint8(1):
+                elif _missing_mask[l] == wp.uint8(1) and counter <= _num_of_aux_data:
                     # The other remaining BC auxiliary data are stored in missing directions of f_1.
                     # Only store up to num_of_aux_data
                     self.write_field(field_storage, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
@@ -412,6 +416,38 @@ def functional(
                     # Only store up to num_of_aux_data
                     return
 
+        @wp.func
+        def decoder_functional(
+            index: Any,
+            _missing_mask: Any,
+            field_storage: Any,
+        ):
+            """
+            Decode the encoded values needed for the boundary condition treatment from the center location in field_storage.
+            """
+
+            # Define a vector to hold prescribed_values
+            prescribed_values = wp.vec(_num_of_aux_data, dtype=self.compute_dtype)
+
+            # Read all q directions, but only retrieve up to num_of_aux_data
+            counter = wp.int32(0)
+            for l in range(self.velocity_set.q):
+                # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                if l == lattice_central_index:
+                    # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
+                    value = self.read_field(field_storage, index, l)
+                    prescribed_values[counter] = self.compute_dtype(value)
+                    counter += 1
+                elif _missing_mask[l] == wp.uint8(1) and counter <= _num_of_aux_data:
+                    # The other remaining BC auxiliary data are stored in missing directions of f_1.
+                    # Only retrieve up to num_of_aux_data
+                    value = self.read_field(field_storage, index, _opp_indices[l])
+                    prescribed_values[counter] = self.compute_dtype(value)
+                    counter += 1
+                if counter > _num_of_aux_data:
+                    # Only retrieve up to num_of_aux_data
+                    return prescribed_values
+
         # Construct the warp kernel
         @wp.kernel
         def kernel(
@@ -433,16 +469,18 @@ def kernel(
                 prescribed_values = self.user_defined_functional(index)
 
                 # call the functional
-                functional(index, _missing_mask, f_1, prescribed_values)
+                encoder_functional(index, _missing_mask, f_1, prescribed_values)
 
-        return functional, kernel
+        functional_dict = {"encoder": encoder_functional, "decoder": decoder_functional}
+        return functional_dict, kernel
 
-    def _construct_neon(self, functional):
+    def _construct_neon(self):
         """
         Constructs the Neon container for encoding auxilary data recovery.
         """
         # Use the warp functional for the Neon backend
-        functional, _ = self._construct_warp()
+        functional_dict, _ = self._construct_warp()
+        encoder_functional = functional_dict["encoder"]
         _id = self.boundary_id
 
         # Construct the Neon container
@@ -470,14 +508,14 @@ def aux_data_init_cl(index: Any):
                         prescribed_values = self.user_defined_functional(warp_index)
 
                         # Call the functional
-                        functional(index, _missing_mask, f_1_pn, prescribed_values)
+                        encoder_functional(index, _missing_mask, f_1_pn, prescribed_values)
 
                 # Declare the kernel in the Neon loader
                 loader.declare_kernel(aux_data_init_cl)
 
             return aux_data_init_ll
 
-        return aux_data_init_container
+        return functional_dict, aux_data_init_container
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, f_1, bc_mask, missing_mask):
@@ -496,7 +534,7 @@ def neon_implementation(self, f_1, bc_mask, missing_mask):
         return f_1
 
 
-class MultiresEncodeInitialAuxiliaryData(EncodeInitialAuxiliaryData):
+class MultiresEncodeAuxiliaryData(EncodeAuxiliaryData):
     """
     Operator for encoding boundary auxiliary data during initialization.
     """
@@ -527,7 +565,8 @@ def _construct_neon(self):
         """
 
         # Borrow the functional from the warp implementation
-        functional, _ = self._construct_warp()
+        functional_dict, _ = self._construct_warp()
+        encoder_functional = functional_dict["encoder"]
         _id = self.boundary_id
 
         # Construct the Neon container
@@ -560,14 +599,14 @@ def aux_data_init_cl(index: Any):
                         prescribed_values = self.user_defined_functional(warp_index)
 
                         # Call the functional
-                        functional(index, _missing_mask, f_1_pn, prescribed_values)
+                        encoder_functional(index, _missing_mask, f_1_pn, prescribed_values)
 
                 # Declare the kernel in the Neon loader
                 loader.declare_kernel(aux_data_init_cl)
 
             return aux_data_init_ll
 
-        return functional, aux_data_init_container
+        return functional_dict, aux_data_init_container
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_1, bc_mask, missing_mask, stream):
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index d7c559e3..7f6eb7c2 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -18,7 +18,7 @@
 from xlb.operator.collision import ForcedCollision
 from xlb.helper import check_bc_overlaps
 from xlb.operator.boundary_masker import MeshVoxelizationMethod, MultiresMeshMaskerAABB, MultiresIndicesBoundaryMasker
-from xlb.operator.boundary_condition.helper_functions_bc import MultiresEncodeInitialAuxiliaryData
+from xlb.operator.boundary_condition.helper_functions_bc import MultiresEncodeAuxiliaryData
 
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
@@ -249,7 +249,7 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
                 # Create the encoder operator for storing the auxiliary data
-                encode_auxiliary_data = MultiresEncodeInitialAuxiliaryData(
+                encode_auxiliary_data = MultiresEncodeAuxiliaryData(
                     bc.id,
                     bc.num_of_aux_data,
                     bc.profile,
@@ -258,7 +258,10 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
                     compute_backend=bc.compute_backend,
                 )
 
-                # Store the auxiliary data in f_1
+                # Assign the object to the BC for its "decoding" tasks
+                bc.encode_auxiliary_data = encode_auxiliary_data
+
+                # Encode the auxiliary data in f_1
                 f_1 = encode_auxiliary_data(f_1, bc_mask, missing_mask, stream=0)
                 bc.is_initialized_with_aux_data = True
         return f_1
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 56eb5043..0136d4a4 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -29,7 +29,7 @@
 )
 from xlb.helper import check_bc_overlaps
 from xlb.helper.nse_fields import create_nse_fields
-from xlb.operator.boundary_condition.helper_functions_bc import EncodeInitialAuxiliaryData
+from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
 
 
 class IncompressibleNavierStokesStepper(Stepper):
@@ -170,7 +170,7 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
                 # Create the encoder operator for storing the auxiliary data
-                encode_auxiliary_data = EncodeInitialAuxiliaryData(
+                encode_auxiliary_data = EncodeAuxiliaryData(
                     bc.id,
                     bc.num_of_aux_data,
                     bc.profile,
@@ -179,7 +179,10 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
                     compute_backend=bc.compute_backend,
                 )
 
-                # Store the auxiliary data in f_1
+                # Assign the object to the BC for its "decoding" tasks
+                bc.encode_auxiliary_data = encode_auxiliary_data
+
+                # Encode the auxiliary data in f_1
                 f_1 = encode_auxiliary_data(f_1, bc_mask, missing_mask)
                 bc.is_initialized_with_aux_data = True
         return f_1

From 15583ad4c694df87e051c03c246b7282b3c3b356 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 22 Aug 2025 18:43:23 -0400
Subject: [PATCH 161/208] Major improvements to the handling of user-specified
 profiles in hybridBC

---
 examples/cfd/rotating_sphere_3d.py            |   2 +-
 xlb/operator/boundary_condition/__init__.py   |   2 +-
 xlb/operator/boundary_condition/bc_hybrid.py  | 137 ++++++++++++++----
 xlb/operator/boundary_condition/bc_zouhe.py   |  16 +-
 .../boundary_condition/helper_functions_bc.py |   2 +-
 xlb/operator/boundary_masker/multires_aabb.py |   6 +
 xlb/operator/stepper/nse_stepper.py           |  16 +-
 7 files changed, 129 insertions(+), 52 deletions(-)

diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index 008ab71e..03906640 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -112,7 +112,7 @@ def bc_profile():
     origin_wp = _u_vec(origin_np[0], origin_np[1], origin_np[2])
 
     @wp.func
-    def bc_profile_warp(index: wp.vec3i, time: Any):
+    def bc_profile_warp(index: wp.vec3i):
         x = dtype(index[0])
         y = dtype(index[1])
         z = dtype(index[2])
diff --git a/xlb/operator/boundary_condition/__init__.py b/xlb/operator/boundary_condition/__init__.py
index b3c814b5..8be2f226 100644
--- a/xlb/operator/boundary_condition/__init__.py
+++ b/xlb/operator/boundary_condition/__init__.py
@@ -1,4 +1,4 @@
-from xlb.operator.boundary_condition.helper_functions_bc import HelperFunctionsBC, EncodeInitialAuxiliaryData, MultiresEncodeInitialAuxiliaryData
+from xlb.operator.boundary_condition.helper_functions_bc import HelperFunctionsBC, EncodeAuxiliaryData, MultiresEncodeAuxiliaryData
 from xlb.operator.boundary_condition.boundary_condition import BoundaryCondition
 from xlb.operator.boundary_condition.boundary_condition_registry import BoundaryConditionRegistry
 from xlb.operator.boundary_condition.bc_equilibrium import EquilibriumBC
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index e403a76c..99efbe21 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -1,3 +1,4 @@
+import inspect
 from jax import jit
 from functools import partial
 import warp as wp
@@ -16,7 +17,7 @@
     HelperFunctionsBC,
 )
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
-
+from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
 
 class HybridBC(BoundaryCondition):
     """
@@ -59,6 +60,10 @@ def __init__(
             voxelization_method,
         )
 
+        # Raise error if used for 2d examples:
+        if self.velocity_set.d == 2:
+            raise NotImplementedError("This BC is not implemented in 2D!")
+
         # Check if the compute backend is Warp
         assert self.compute_backend == ComputeBackend.WARP or ComputeBackend.NEON, "This BC is currently not supported by JAX backend!"
 
@@ -67,9 +72,13 @@ def __init__(
         self.macroscopic = Macroscopic(compute_backend=ComputeBackend.WARP)
         self.equilibrium = QuadraticEquilibrium(compute_backend=ComputeBackend.WARP)
 
-        # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
-        # velocity profiles given by keyword "profile" which must be a callable function.
-        self.profile = profile
+        # Define BC helper functions. Explicitly using the WARP backend for helper functions as it may also be called by the Neon backend.
+        self.bc_helper = HelperFunctionsBC(
+            velocity_set=self.velocity_set,
+            precision_policy=self.precision_policy,
+            compute_backend=ComputeBackend.WARP,
+            distance_decoder_function=self._construct_distance_decoder_function(),
+        )
 
         # A flag to enable moving wall treatment when either "prescribed_value" or "profile" are provided.
         self.needs_moving_wall_treatment = False
@@ -84,8 +93,7 @@ def __init__(
 
         # Handle prescribed value if provided
         if prescribed_value is not None:
-            if profile is not None:
-                raise ValueError("Cannot specify both profile and prescribed_value")
+            assert profile is None, "Cannot specify both profile and prescribed_value"
 
             # Ensure prescribed_value is a NumPy array of floats
             if isinstance(prescribed_value, (tuple, list, np.ndarray)):
@@ -103,10 +111,23 @@ def __init__(
             prescribed_value = _u_vec(prescribed_value)
 
             @wp.func
-            def prescribed_profile_warp(index: Any, time: Any):
+            def prescribed_profile_warp(index: Any):
                 return _u_vec(prescribed_value[0], prescribed_value[1], prescribed_value[2])
 
-            self.profile = prescribed_profile_warp
+            profile = prescribed_profile_warp
+
+        # Inspect the function signature and add time parameter if needed
+        self.is_time_dependent = False
+        sig = inspect.signature(profile)
+        if len(sig.parameters) > 1:
+            # We assume the profile function takes only the index as input and is hence time-independent.
+            # In case it is defined with more than 1 input, we assume the second input is time and create
+            # a wrapper function that also accepts time as a parameter.
+            self.is_time_dependent = True
+
+        # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
+        # velocity profiles given by keyword "profile" which must be a callable function.
+        self.profile = self._construct_profile(profile)
 
         # Set whether this BC needs mesh distance
         self.needs_mesh_distance = use_mesh_distance
@@ -126,17 +147,63 @@ def prescribed_profile_warp(index: Any, time: Any):
         else:
             assert self.indices is None, "Cannot use indices with mesh vertices! Please provide mesh vertices only."
 
-        # Define BC helper functions. Explicitly using the WARP backend for helper functions as it may also be called by the Neon backend.
-        self.bc_helper = HelperFunctionsBC(
-            velocity_set=self.velocity_set,
-            precision_policy=self.precision_policy,
-            compute_backend=ComputeBackend.WARP,
-            distance_decoder_function=self._construct_distance_decoder_function(),
-        )
+        if not (self.needs_mesh_distance or self.is_time_dependent):
+            # The profile is encoded in f_1 only when neither of the following two cases applies:
+            # (i)  mesh distance data are already stored in f_1
+            # (ii) the user-defined functional is time-dependent and cannot be stored only once during initialization
+            # This BC needs auxiliary data initialization before streaming
+            self.needs_aux_init = True
 
-        # Raise error if used for 2d examples:
-        if self.velocity_set.d == 2:
-            raise NotImplementedError("This BC is not implemented in 2D!")
+            # This BC needs auxiliary data recovery after streaming
+            self.needs_aux_recovery = True
+
+            # This BC needs three auxiliary data (see num_of_aux_data below) for the user-prescribed velocity profile
+            # The user prescribed function for velocity profile (e.g. rotating velocity) can be stored and retrieved using f_1
+            self.num_of_aux_data = 3
+
+            # Create the encoder operator for storing the auxiliary data
+            self.encode_auxiliary_data = EncodeAuxiliaryData(
+                self.id,
+                self.num_of_aux_data,
+                self.profile,
+                velocity_set=self.velocity_set,
+                precision_policy=self.precision_policy,
+                compute_backend=self.compute_backend,
+            )
+
+        # Define the profile decoder functional
+        self.profile_decoder_functional = self._construct_profile_decoder_functional()
+
+    def _construct_profile(self, profile):
+        """
+        This function wraps the user-specified profile which is a warp function with required input arguments.
+        TODO:
+        We ONLY impose a profile on a boundary which requires mesh-distance if that boundary lives on the finest level.
+        This is because I don't know how to extract "level" from the
+        "neon_field_hdl" to do:
+               cIdx = wp.neon_global_idx(field_neon_hdl, index)
+               gx = wp.neon_get_x(cIdx) // 2 ** level
+               gy = wp.neon_get_y(cIdx) // 2 ** level
+               gz = wp.neon_get_z(cIdx) // 2 ** level
+        """
+
+        # The following wrappers are simply to enable "decoder_functional" calls to have the same signature (see below)
+        @wp.func
+        def wrapped_profile_warp(field: Any, index: Any, timestep: Any, _missing_mask: Any):
+            if wp.static(self.is_time_dependent):
+                return profile(index, timestep)
+            else:
+                return profile(index)
+
+        @wp.func
+        def wrapped_profile_neon(field: Any, index: Any, timestep: Any, _missing_mask: Any):
+            index_wp = self.bc_helper.neon_index_to_warp(field, index)
+            if wp.static(self.is_time_dependent):
+                return profile(index_wp, timestep)
+            else:
+                return profile(index_wp)
+
+        return wrapped_profile_neon if self.compute_backend == ComputeBackend.NEON else wrapped_profile_warp
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
@@ -151,21 +218,31 @@ def _construct_distance_decoder_function(self):
         _opp_indices = self.velocity_set.opp_indices
 
         # Define the distance decoder function for this BC
-        if self.compute_backend == ComputeBackend.WARP:
+        @wp.func
+        def distance_decoder_function(f_1: Any, index: Any, direction: Any):
+            return self.read_field(f_1, index, _opp_indices[direction])
 
-            @wp.func
-            def distance_decoder_function(f_1: Any, index: Any, direction: Any):
-                return f_1[_opp_indices[direction], index[0], index[1], index[2]]
+        return distance_decoder_function
 
-        elif self.compute_backend == ComputeBackend.NEON:
+    def _construct_profile_decoder_functional(self):
+        """
+        Get the profile decoder functional for this BC.
+        """
+        # Get decoder functional
+        if self.needs_mesh_distance or self.is_time_dependent:
+            # In the following two cases we simply call the user-defined profile warp function
+            # (i)  mesh distance data are already stored in f_1
+            # (ii) the user-defined functional is time-dependent and cannot be stored only once during initialization
+            decoder_functional = self.profile
+        else:
 
             @wp.func
-            def distance_decoder_function(f_1_pn: Any, index: Any, direction: Any):
-                return wp.neon_read(f_1_pn, index, _opp_indices[direction])
-
-        return distance_decoder_function
+            def decoder_functional(f_1: Any, index: Any, timestep: Any, _missing_mask: Any):
+                return self.encode_auxiliary_data.warp_functional["decode"](f_1, index, _missing_mask)
+        return decoder_functional
 
     def _construct_warp(self):
+
         # Construct the functionals for this BC
         @wp.func
         def hybrid_bounceback_regularized(
@@ -185,7 +262,7 @@ def hybrid_bounceback_regularized(
             #     in: 41st aerospace sciences meeting and exhibit, p. 953.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
-            u_wall = self.profile(index, timestep)
+            u_wall = self.profile_decoder_functional(f_1, index, timestep, _missing_mask)
             f_post = self.bc_helper.interpolated_bounceback(
                 index,
                 _missing_mask,
@@ -224,7 +301,7 @@ def hybrid_bounceback_grads(
             #     in: 41st aerospace sciences meeting and exhibit, p. 953.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
-            u_wall = self.profile(index, timestep)
+            u_wall = self.profile_decoder_functional(f_1, index, timestep, _missing_mask)
             f_post = self.bc_helper.interpolated_bounceback(
                 index,
                 _missing_mask,
@@ -262,7 +339,7 @@ def hybrid_nonequilibrium_regularized(
             #     boundaries in the lattice Boltzmann method. Physical Review E 77, 056703.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
-            u_wall = self.profile(index, timestep)
+            u_wall = self.profile_decoder_functional(f_1, index, timestep, _missing_mask)
             f_post = self.bc_helper.interpolated_nonequilibrium_bounceback(
                 index,
                 _missing_mask,
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 6883afeb..9c27aba4 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -21,6 +21,7 @@
 from xlb.operator.boundary_condition import HelperFunctionsBC
 from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
+from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
 
 
 class ZouHeBC(BoundaryCondition):
@@ -114,8 +115,15 @@ def __init__(
             # This BC needs one auxiliary data for the density or normal velocity
             self.num_of_aux_data = 1
 
-            # A placeholder for encoder-decoder object
-            self.encode_auxiliary_data = None
+            # Create the encoder operator for storing the auxiliary data
+            self.encode_auxiliary_data = EncodeAuxiliaryData(
+                self.id,
+                self.num_of_aux_data,
+                self.profile,
+                velocity_set=self.velocity_set,
+                precision_policy=self.precision_policy,
+                compute_backend=self.compute_backend,
+            )
 
         # This BC needs padding for finding missing directions when imposed on a geometry that is in the domain interior
         self.needs_padding = True
@@ -311,7 +319,7 @@ def functional_velocity(
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = decoder_functional(index, _missing_mask, f_1)
+            prescribed_value = decoder_functional(f_1, index, _missing_mask)
             _u = -prescribed_value * normals
 
             for d in range(_d):
@@ -342,7 +350,7 @@ def functional_pressure(
 
             # Find the value of rho from the missing directions
             # Since we need only one scalar value, we only need to find one value (stored at the center of f_1)
-            _rho = decoder_functional(index, _missing_mask, f_1)
+            _rho = decoder_functional(f_1, index, _missing_mask)
 
             # calculate velocity
             fsum = bc_helper.get_bc_fsum(_f, _missing_mask)
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index ddd73ee5..02b137df 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -418,9 +418,9 @@ def encoder_functional(
 
         @wp.func
         def decoder_functional(
+            field_storage: Any,
             index: Any,
             _missing_mask: Any,
-            field_storage: Any,
         ):
             """
             Decode the encoded values needed for the boundary condition treatment from the center location in field_storage.
diff --git a/xlb/operator/boundary_masker/multires_aabb.py b/xlb/operator/boundary_masker/multires_aabb.py
index d7633c07..eb8237f6 100644
--- a/xlb/operator/boundary_masker/multires_aabb.py
+++ b/xlb/operator/boundary_masker/multires_aabb.py
@@ -15,6 +15,12 @@ class MultiresMeshMaskerAABB(MeshMaskerAABB):
     This implementation uses warp.mesh_query_aabb for efficient mesh-voxel intersection testing,
     providing approximate 1-voxel thick surface detection around the mesh geometry.
     Suitable for scenarios where fast, approximate boundary detection is sufficient.
+    TODO:
+    We cannot properly mask a mesh file if it lives on any level other than the finest. This issue can be easily solved by adding
+           gx = wp.neon_get_x(cIdx) // 2 ** level
+           gy = wp.neon_get_y(cIdx) // 2 ** level
+           gz = wp.neon_get_z(cIdx) // 2 ** level
+    to the "neon_index_to_warp" and subsequently add "level" to the arguments of "index_to_position_neon", "get_pull_index_neon" and "is_in_bc_indices_neon"
     """
 
     def __init__(
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 0136d4a4..d2037c61 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -29,7 +29,6 @@
 )
 from xlb.helper import check_bc_overlaps
 from xlb.helper.nse_fields import create_nse_fields
-from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
 
 
 class IncompressibleNavierStokesStepper(Stepper):
@@ -169,21 +168,8 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
-                # Create the encoder operator for storing the auxiliary data
-                encode_auxiliary_data = EncodeAuxiliaryData(
-                    bc.id,
-                    bc.num_of_aux_data,
-                    bc.profile,
-                    velocity_set=bc.velocity_set,
-                    precision_policy=bc.precision_policy,
-                    compute_backend=bc.compute_backend,
-                )
-
-                # Assign the object to the BC for its "decoding" tasks
-                bc.encode_auxiliary_data = encode_auxiliary_data
-
                 # Encode the auxiliary data in f_1
-                f_1 = encode_auxiliary_data(f_1, bc_mask, missing_mask)
+                f_1 = bc.encode_auxiliary_data(f_1, bc_mask, missing_mask)
                 bc.is_initialized_with_aux_data = True
         return f_1
 

From a2980bea2e123b8018d74b292b3874e22ea1d2e3 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 22 Aug 2025 23:48:54 -0400
Subject: [PATCH 162/208] fixed bugs in ZouHe and Regularized after changes

---
 .../boundary_condition/bc_regularized.py      | 32 ++++++-------------
 xlb/operator/boundary_condition/bc_zouhe.py   | 11 ++++---
 .../boundary_condition/helper_functions_bc.py |  5 +--
 3 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index c1fa5286..8f369c47 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -137,7 +137,7 @@ def _construct_warp(self):
         def functional_velocity(
             index: Any,
             timestep: Any,
-            missing_mask: Any,
+            _missing_mask: Any,
             f_0: Any,
             f_1: Any,
             f_pre: Any,
@@ -147,16 +147,16 @@ def functional_velocity(
             _f = f_post
 
             # Find normal vector
-            normals = bc_helper.get_normal_vectors(missing_mask)
+            normals = bc_helper.get_normal_vectors(_missing_mask)
 
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = decode_lattice_center_value(index, f_1)
+            prescribed_value = self.decoder_functional(f_1, index, _missing_mask)[0]
             _u = -prescribed_value * normals
 
             # calculate rho
-            fsum = bc_helper.get_bc_fsum(_f, missing_mask)
+            fsum = bc_helper.get_bc_fsum(_f, _missing_mask)
             unormal = self.compute_dtype(0.0)
             for d in range(_d):
                 unormal += _u[d] * normals[d]
@@ -164,7 +164,7 @@ def functional_velocity(
 
             # impose non-equilibrium bounceback
             feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bc_helper.bounceback_nonequilibrium(_f, feq, missing_mask)
+            _f = bc_helper.bounceback_nonequilibrium(_f, feq, _missing_mask)
 
             # Regularize the boundary fpop
             _f = bc_helper.regularize_fpop(_f, feq)
@@ -174,7 +174,7 @@ def functional_velocity(
         def functional_pressure(
             index: Any,
             timestep: Any,
-            missing_mask: Any,
+            _missing_mask: Any,
             f_0: Any,
             f_1: Any,
             f_pre: Any,
@@ -184,37 +184,25 @@ def functional_pressure(
             _f = f_post
 
             # Find normal vector
-            normals = bc_helper.get_normal_vectors(missing_mask)
+            normals = bc_helper.get_normal_vectors(_missing_mask)
 
             # Find the value of rho from the missing directions
             # Since we need only one scalar value, we only need to find one value (stored at the center of f_1)
-            _rho = decode_lattice_center_value(index, f_1)
+            _rho = self.decoder_functional(f_1, index, _missing_mask)[0]
 
             # calculate velocity
-            fsum = bc_helper.get_bc_fsum(_f, missing_mask)
+            fsum = bc_helper.get_bc_fsum(_f, _missing_mask)
             unormal = -self.compute_dtype(1.0) + fsum / _rho
             _u = unormal * normals
 
             # impose non-equilibrium bounceback
             feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bc_helper.bounceback_nonequilibrium(_f, feq, missing_mask)
+            _f = bc_helper.bounceback_nonequilibrium(_f, feq, _missing_mask)
 
             # Regularize the boundary fpop
             _f = bc_helper.regularize_fpop(_f, feq)
             return _f
 
-        @wp.func
-        def decode_lattice_center_value(index: Any, f_1: Any):
-            """
-            Decode the encoded values needed for the boundary condition treatment from the center location in f_1.
-            """
-            if wp.static(self.compute_backend == ComputeBackend.WARP):
-                value = f_1[lattice_central_index, index[0], index[1], index[2]]
-            else:
-                # Note: in Neon case, f_1 is a pointer to the field not the actual data.
-                value = wp.neon_read(f_1, index, lattice_central_index)
-            return self.compute_dtype(value)
-
         if self.bc_type == "velocity":
             functional = functional_velocity
         elif self.bc_type == "pressure":
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 9c27aba4..881160b7 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -125,6 +125,10 @@ def __init__(
                 compute_backend=self.compute_backend,
             )
 
+            # get decoder functional
+            functional_dict, _ = self.encode_auxiliary_data._construct_warp()
+            self.decoder_functional = functional_dict["decoder"]
+
         # This BC needs padding for finding missing directions when imposed on a geometry that is in the domain interior
         self.needs_padding = True
 
@@ -290,9 +294,6 @@ def _construct_warp(self):
         # load helper functions. Always use warp backend for helper functions as it may also be called by the Neon backend.
         bc_helper = HelperFunctionsBC(velocity_set=self.velocity_set, precision_policy=self.precision_policy, compute_backend=ComputeBackend.WARP)
 
-        # get decoder functional
-        decoder_functional = self.encode_auxiliary_data.warp_functional["decode"]
-
         # Set local constants
         _d = self.velocity_set.d
 
@@ -319,7 +320,7 @@ def functional_velocity(
             # Find the value of u from the missing directions
             # Since we are only considering normal velocity, we only need to find one value (stored at the center of f_1)
             # Create velocity vector by multiplying the prescribed value with the normal vector
-            prescribed_value = decoder_functional(f_1, index, _missing_mask)
+            prescribed_value = self.decoder_functional(f_1, index, _missing_mask)[0]
             _u = -prescribed_value * normals
 
             for d in range(_d):
@@ -350,7 +351,7 @@ def functional_pressure(
 
             # Find the value of rho from the missing directions
             # Since we need only one scalar value, we only need to find one value (stored at the center of f_1)
-            _rho = decoder_functional(f_1, index, _missing_mask)
+            _rho = self.decoder_functional(f_1, index, _missing_mask)[0]
 
             # calculate velocity
             fsum = bc_helper.get_bc_fsum(_f, _missing_mask)
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 02b137df..0983b646 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -367,7 +367,7 @@ def __init__(
         # We assume the profile function takes only the index as input and is hence time-independent.
         sig = inspect.signature(user_defined_functional)
         assert self.compute_backend != ComputeBackend.JAX, "Encoding/decoding of auxiliary data are not required for boundary conditions in JAX"
-        assert len(sig.parameters) == 1, "User-defined functional must take exactly one argument (the index)."
+        assert len(sig.parameters) == 1, f"User-defined functional must take exactly one argument (the index), it received {len(sig.parameters)}."
 
         # Define a HelperFunctionsBC instance
         self.bc_helper = HelperFunctionsBC(
@@ -387,6 +387,7 @@ def _construct_warp(self):
         _opp_indices = self.velocity_set.opp_indices
         _id = self.boundary_id
         _num_of_aux_data = self.num_of_aux_data
+        _aux_vec = wp.vec(_num_of_aux_data, dtype=self.compute_dtype)
 
         @wp.func
         def encoder_functional(
@@ -427,7 +428,7 @@ def decoder_functional(
             """
 
             # Define a vector to hold prescribed_values
-            prescribed_values = wp.vec(_num_of_aux_data, dtype=self.compute_dtype)
+            prescribed_values = _aux_vec()
 
             # Read all q directions, but only retrieve up to num_of_aux_data
             counter = wp.int32(0)

From 1dc0df33e1bf5d313f691fd24b94d05577c7eb7e Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sat, 23 Aug 2025 00:15:46 -0400
Subject: [PATCH 163/208] HybridBC now working as expected with profiles

---
 xlb/operator/boundary_condition/bc_hybrid.py | 65 +++++++-------------
 1 file changed, 23 insertions(+), 42 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 99efbe21..8ad3d555 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -19,6 +19,7 @@
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
 
+
 class HybridBC(BoundaryCondition):
     """
     The hybrid BC methods in this boundary condition have been originally developed by H. Salehipour and are inspired from
@@ -127,7 +128,7 @@ def prescribed_profile_warp(index: Any):
 
         # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
         # velocity profiles given by keyword "profile" which must be a callable function.
-        self.profile = self._construct_profile(profile)
+        self.profile = profile
 
         # Set whether this BC needs mesh distance
         self.needs_mesh_distance = use_mesh_distance
@@ -171,40 +172,13 @@ def prescribed_profile_warp(index: Any):
                 compute_backend=self.compute_backend,
             )
 
+            # Get auxiliary decoder functional
+            functional_dict, _ = self.encode_auxiliary_data._construct_warp()
+            self.decoder_functional = functional_dict["decoder"]
+
         # Define the profile decoder functional
         self.profile_decoder_functional = self._construct_profile_decoder_functional()
 
-    def _construct_profile(self, profile):
-        """
-        This function wraps the user-specified profile which is a warp function with required input arguments.
-        TODO:
-        We ONLY impose a profile on a boundary which requires mesh-distance if that boundary lives on the finest level.
-        This is because I don't know how to extract "level" from the
-        "neon_field_hdl" to do:
-               cIdx = wp.neon_global_idx(field_neon_hdl, index)
-               gx = wp.neon_get_x(cIdx) // 2 ** level
-               gy = wp.neon_get_y(cIdx) // 2 ** level
-               gz = wp.neon_get_z(cIdx) // 2 ** level
-        """
-
-        # The following wrappers are simply to enable "decoder_functional" calls to have the same signature (see below)
-        @wp.func
-        def wrapped_profile_warp(field: Any, index: Any, timestep: Any, _missing_mask: Any):
-            if wp.static(self.is_time_dependent):
-                return profile(index, timestep)
-            else:
-                return profile(index)
-
-        @wp.func
-        def wrapped_profile_neon(field: Any, index: Any, timestep: Any, _missing_mask: Any):
-            index_wp = self.bc_helper.neon_index_to_warp(field, index)
-            if wp.static(self.is_time_dependent):
-                return profile(index_wp, timestep)
-            else:
-                return profile(index_wp)
-
-        return wrapped_profile_neon if self.compute_backend == ComputeBackend.NEON else wrapped_profile_warp
-
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
     def jax_implementation(self, f_pre, f_post, bc_mask, missing_mask):
@@ -227,22 +201,29 @@ def distance_decoder_function(f_1: Any, index: Any, direction: Any):
     def _construct_profile_decoder_functional(self):
         """
         Get the profile decoder functional for this BC.
+        Note:
+        We can impose a profile on a boundary which requires mesh-distance only if that boundary lives on the finest level.
+        This is because I don't know how to extract "level" from the
+        "neon_field_hdl" to do:
+               cIdx = wp.neon_global_idx(field_neon_hdl, index)
+               gx = wp.neon_get_x(cIdx) // 2 ** level
+               gy = wp.neon_get_y(cIdx) // 2 ** level
+               gz = wp.neon_get_z(cIdx) // 2 ** level
         """
+
         # Get decoder functional
-        if self.needs_mesh_distance or self.is_time_dependent:
-            # In the following two cases we simply call the user-defined profile warp function
-            # (i)  mesh distance data are already stored in f_1
-            # (ii) the user-defined functional is time-dependent and cannot be stored only once during initialization
-            decoder_functional = self.profile
-        else:
+        @wp.func
+        def decoder_functional(f_1: Any, index: Any, timestep: Any, _missing_mask: Any):
+            if wp.static(self.is_time_dependent):
+                return self.profile(index, timestep)
+            elif wp.static(self.needs_mesh_distance and not self.is_time_dependent):
+                return self.profile(index)
+            else:
+                return self.decoder_functional(f_1, index, _missing_mask)
 
-            @wp.func
-            def decoder_functional(f_1: Any, index: Any, timestep: Any, _missing_mask: Any):
-                return self.encode_auxiliary_data.warp_functional["decode"](f_1, index, _missing_mask)
         return decoder_functional
 
     def _construct_warp(self):
-
         # Construct the functionals for this BC
         @wp.func
         def hybrid_bounceback_regularized(

From 7c41c5fb0459ae99785c3af19892ae780cfff49d Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sat, 23 Aug 2025 00:36:59 -0400
Subject: [PATCH 164/208] improved the logic a bit. needs more testing

---
 xlb/operator/boundary_condition/bc_hybrid.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 8ad3d555..e7b79115 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -81,6 +81,9 @@ def __init__(
             distance_decoder_function=self._construct_distance_decoder_function(),
         )
 
+        # A flag to track if available space in "f_1" for storing auxiliary data is full
+        self.auxiliary_storage_space_full = False
+
         # A flag to enable moving wall treatment when either "prescribed_value" or "profile" are provided.
         self.needs_moving_wall_treatment = False
 
@@ -126,6 +129,9 @@ def prescribed_profile_warp(index: Any):
             # a wrapper function that also accepts time as a parameter.
             self.is_time_dependent = True
 
+            # For time dependent prescribed values, we cannot store at initialization
+            self.auxiliary_storage_space_full = True
+
         # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
         # velocity profiles given by keyword "profile" which must be a callable function.
         self.profile = profile
@@ -137,6 +143,7 @@ def prescribed_profile_warp(index: Any):
         if self.needs_mesh_distance:
             # This BC needs auxiliary data recovery after streaming
             self.needs_aux_recovery = True
+            self.auxiliary_storage_space_full = True
 
         # If this BC is defined using indices, it would need padding in order to find missing directions
         # when imposed on a geometry that is in the domain interior
@@ -148,7 +155,7 @@ def prescribed_profile_warp(index: Any):
         else:
             assert self.indices is None, "Cannot use indices with mesh vertices! Please provide mesh vertices only."
 
-        if not (self.needs_mesh_distance or self.is_time_dependent):
+        if not self.auxiliary_storage_space_full:
             # In the following two cases we simply call the user-defined profile warp function
             # (i)  mesh distance data are already stored in f_1
             # (ii) the user-defined functional is time-dependent and cannot be stored only once during initialization
@@ -214,12 +221,12 @@ def _construct_profile_decoder_functional(self):
         # Get decoder functional
         @wp.func
         def decoder_functional(f_1: Any, index: Any, timestep: Any, _missing_mask: Any):
-            if wp.static(self.is_time_dependent):
+            if wp.static(not self.auxiliary_storage_space_full):
+                return self.decoder_functional(f_1, index, _missing_mask)
+            elif wp.static(self.is_time_dependent):
                 return self.profile(index, timestep)
-            elif wp.static(self.needs_mesh_distance and not self.is_time_dependent):
-                return self.profile(index)
             else:
-                return self.decoder_functional(f_1, index, _missing_mask)
+                return self.profile(index)
 
         return decoder_functional
 

From 11e81292e98a755f75f900bf82f77c4e930a3297 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 1 Sep 2025 17:42:38 -0400
Subject: [PATCH 165/208] minor change: not storing the EncodeAuxiliaryData
 object as a BC attribute but rather creating it where needed.

---
 xlb/operator/boundary_condition/bc_hybrid.py |  4 ++--
 xlb/operator/boundary_condition/bc_zouhe.py  |  4 ++--
 xlb/operator/stepper/nse_multires_stepper.py |  3 ---
 xlb/operator/stepper/nse_stepper.py          | 13 ++++++++++++-
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index e7b79115..b53c9cf7 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -170,7 +170,7 @@ def prescribed_profile_warp(index: Any):
             self.num_of_aux_data = 3
 
             # Create the encoder operator for storing the auxiliary data
-            self.encode_auxiliary_data = EncodeAuxiliaryData(
+            encode_auxiliary_data = EncodeAuxiliaryData(
                 self.id,
                 self.num_of_aux_data,
                 self.profile,
@@ -180,7 +180,7 @@ def prescribed_profile_warp(index: Any):
             )
 
             # Get auxiliary decoder functional
-            functional_dict, _ = self.encode_auxiliary_data._construct_warp()
+            functional_dict, _ = encode_auxiliary_data._construct_warp()
             self.decoder_functional = functional_dict["decoder"]
 
         # Define the profile decoder functional
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 881160b7..5a940c62 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -116,7 +116,7 @@ def __init__(
             self.num_of_aux_data = 1
 
             # Create the encoder operator for storing the auxiliary data
-            self.encode_auxiliary_data = EncodeAuxiliaryData(
+            encode_auxiliary_data = EncodeAuxiliaryData(
                 self.id,
                 self.num_of_aux_data,
                 self.profile,
@@ -126,7 +126,7 @@ def __init__(
             )
 
             # get decoder functional
-            functional_dict, _ = self.encode_auxiliary_data._construct_warp()
+            functional_dict, _ = encode_auxiliary_data._construct_warp()
             self.decoder_functional = functional_dict["decoder"]
 
         # This BC needs padding for finding missing directions when imposed on a geometry that is in the domain interior
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 7f6eb7c2..f64feeb9 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -258,9 +258,6 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
                     compute_backend=bc.compute_backend,
                 )
 
-                # Assign the object to the BC for its "decoding" tasks
-                bc.encode_auxiliary_data = encode_auxiliary_data
-
                 # Encode the auxiliary data in f_1
                 f_1 = encode_auxiliary_data(f_1, bc_mask, missing_mask, stream=0)
                 bc.is_initialized_with_aux_data = True
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index d2037c61..ad7b4384 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -29,6 +29,7 @@
 )
 from xlb.helper import check_bc_overlaps
 from xlb.helper.nse_fields import create_nse_fields
+from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
 
 
 class IncompressibleNavierStokesStepper(Stepper):
@@ -168,8 +169,18 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         """Initialize auxiliary data for boundary conditions that require it."""
         for bc in boundary_conditions:
             if bc.needs_aux_init and not bc.is_initialized_with_aux_data:
+                # Create the encoder operator for storing the auxiliary data
+                encode_auxiliary_data = EncodeAuxiliaryData(
+                    bc.id,
+                    bc.num_of_aux_data,
+                    bc.profile,
+                    velocity_set=bc.velocity_set,
+                    precision_policy=bc.precision_policy,
+                    compute_backend=bc.compute_backend,
+                )
+
                 # Encode the auxiliary data in f_1
-                f_1 = bc.encode_auxiliary_data(f_1, bc_mask, missing_mask)
+                f_1 = encode_auxiliary_data(f_1, bc_mask, missing_mask)
                 bc.is_initialized_with_aux_data = True
         return f_1
 

From 14509b7d8791ba4401dd496a569bf835fdc71323 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 2 Sep 2025 14:32:00 +0200
Subject: [PATCH 166/208] update(api): propagating API changes to mres code.

---
 xlb/operator/boundary_masker/indices_boundary_masker.py | 5 ++++-
 xlb/operator/stepper/nse_multires_stepper.py            | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 7d35195f..ee2deb25 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -331,7 +331,10 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
         # Convert to Warp arrays
         if self.compute_backend == ComputeBackend.NEON:
             grid = self.grid
-            ndevice = grid.bk.get_num_devices()
+            if grid is None:
+                ndevice = 1
+            else:
+                ndevice = grid.bk.get_num_devices()
             if ndevice == 1:
                 wp_bc_indices = wp.array(indices, dtype=wp.int32)
                 wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 38b7a97e..17acbfe0 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -692,8 +692,8 @@ def finest_fused_pull(
             if level != 0:
                 # throw an exception
                 raise Exception("Only the finest level is supported for now")
-
-            num_levels = f_0_fd.get_grid().get_num_levels()
+            grid = f_0_fd.get_grid()
+            num_levels =grid.num_levels
 
             # if level != 0:
             #     # throw an exception
@@ -705,7 +705,7 @@ def finest_fused_pull(
             def finest_fused_pull_launcher(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-                if level + 1 < f_0_fd.get_grid().get_num_levels():
+                if level + 1 < f_0_fd.get_grid().num_levels:
                     f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
                 else:

From 2b35abfe9ba586a2414f5548db80aacd5c6b918e Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 2 Sep 2025 15:09:58 +0200
Subject: [PATCH 167/208] fix(extra): removing debug export operation for
 bc_mask.

---
 .../boundary_masker/indices_boundary_masker.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index ee2deb25..83c28e3a 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -533,11 +533,11 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
 
         # If there are no interior boundary conditions, skip the rest and retun early
         if not bc_interior:
-            wp.synchronize()
-            bc_mask.update_host(0)
-            wp.synchronize()
-            bc_mask.export_vti("bc_mask.vti", "m")
-            wp.synchronize()
+            # wp.synchronize()
+            # bc_mask.update_host(0)
+            # wp.synchronize()
+            # bc_mask.export_vti("bc_mask.vti", "m")
+            # wp.synchronize()
             return bc_mask, missing_mask
 
         # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
@@ -555,9 +555,9 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         )
         container_interior_bc_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
-        wp.synchronize()
-        bc_mask.update_host(0)
-        wp.synchronize()
-        bc_mask.export_vti(f"{"bc_mask"}.vti", "u")
+        # wp.synchronize()
+        # bc_mask.update_host(0)
+        # wp.synchronize()
+        # bc_mask.export_vti(f"{"bc_mask"}.vti", "u")
 
         return bc_mask, missing_mask

From 7b857eac7d2f82b45734948e0a4eeb88f0579733 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 2 Sep 2025 17:17:22 +0200
Subject: [PATCH 168/208] style(ruff): running ruff

---
 examples/performance/mlups_3d.py              | 236 ++++++++++--------
 xlb/default_config.py                         |   1 +
 xlb/grid/neon_grid.py                         |  42 ++--
 .../indices_boundary_masker.py                |   1 +
 xlb/operator/stepper/nse_multires_stepper.py  |   2 +-
 xlb/operator/stepper/nse_stepper.py           |  36 ++-
 xlb/precision_policy.py                       |   4 +-
 xlb/utils/mesher.py                           |   2 +-
 8 files changed, 177 insertions(+), 147 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 12d4f9c8..28f55323 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -21,39 +21,52 @@ def parse_arguments():
     parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation")
     parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-    parser.add_argument("--gpu_devices", type=str, default=None, help="List of the CUDA devices to use (e.g., --gpu_devices=[0,1,2]). This is only used for Neon backend.")
+    parser.add_argument(
+        "--gpu_devices",
+        type=str,
+        default=None,
+        help="List of the CUDA devices to use (e.g., --gpu_devices=[0,1,2]). This is only used for Neon backend.",
+    )
     # add a flat to choose between 19 or 27 velocity set
     parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-    # add a flat to choose between multi-gpu occ options based on the neon occ: 
-    parser.add_argument("--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, none) (default: standard)")
+    # add a flat to choose between multi-gpu occ options based on the neon occ:
+    parser.add_argument(
+        "--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, none) (default: standard)"
+    )
     parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
     parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)")
     parser.add_argument("--measure_scalability", action="store_true", help="Measure scalability of the simulation (default: disabled)")
-    parser.add_argument("--repetitions", type=int, default=1, help="Number of repetitions for the simulation (default: 1) to get the average MLUPs and standard deviation")
+    parser.add_argument(
+        "--repetitions",
+        type=int,
+        default=1,
+        help="Number of repetitions for the simulation (default: 1) to get the average MLUPs and standard deviation",
+    )
 
     args = parser.parse_args()
-    
+
     # Parse gpu_devices string to list
     if args.gpu_devices is not None:
         try:
             import ast
+
             args.gpu_devices = ast.literal_eval(args.gpu_devices)
             if not isinstance(args.gpu_devices, list):
                 args.gpu_devices = [args.gpu_devices]  # Handle single integer case
         except (ValueError, SyntaxError):
             raise ValueError("Invalid gpu_devices format. Use format like [0,1,2] or [0]")
-    
+
     # Checking the compute backend and covert it to the right type
     compute_backend = None
     if args.compute_backend == "jax":
         compute_backend = ComputeBackend.JAX
-    elif args.compute_backend == "warp":    
+    elif args.compute_backend == "warp":
         compute_backend = ComputeBackend.WARP
     elif args.compute_backend == "neon":
         compute_backend = ComputeBackend.NEON
     else:
         raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.")
-    args.compute_backend = compute_backend 
+    args.compute_backend = compute_backend
 
     # Checking OCC
     if args.occ not in ["standard", "none"]:
@@ -63,6 +76,7 @@ def parse_arguments():
         args.gpu_devices = [0]
     if args.compute_backend == ComputeBackend.NEON:
         import neon
+
         occ = neon.SkeletonConfig.OCC.from_string(args.occ)
         args.occ = occ
 
@@ -81,7 +95,7 @@ def parse_arguments():
     # Checking velocity set
     if args.velocity_set not in ["D3Q19", "D3Q27"]:
         raise ValueError("Invalid velocity set. Use 'D3Q19' or 'D3Q27'.")
-    
+
     if args.velocity_set == "D3Q19":
         velocity_set = xlb.velocity_set.D3Q19(precision_policy=args.precision_policy, compute_backend=compute_backend)
     elif args.velocity_set == "D3Q27":
@@ -92,6 +106,7 @@ def parse_arguments():
 
     return args
 
+
 def print_args(args):
     # Print simulation configuration
     print("=" * 60)
@@ -110,13 +125,14 @@ def print_args(args):
     if args.compute_backend.name == "NEON":
         print(f"GPU Devices:          {args.gpu_devices}")
         # Convert the neon OCC enum back to string for display
-        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
         print(f"OCC Strategy:         {occ_display}")
 
     print("=" * 60)
     print("Starting simulation...")
     print()
 
+
 def init_xlb(args):
     xlb.init(
         velocity_set=args.velocity_set,
@@ -125,7 +141,7 @@ def init_xlb(args):
     )
     options = None
     if args.compute_backend == ComputeBackend.NEON:
-        neon_options = {'occ': args.occ, 'device_list': args.gpu_devices}
+        neon_options = {"occ": args.occ, "device_list": args.gpu_devices}
         options = neon_options
     return args.compute_backend, args.precision_policy, options
 
@@ -171,7 +187,7 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
         f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, i)
         f_0, f_1 = f_1, f_0
     wp.synchronize()
-    export_num_steps =  warmup_iterations
+    export_num_steps = warmup_iterations
 
     elapsed_time_list = []
     for i in range(repetitions):
@@ -206,21 +222,21 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
 
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time):
-    total_lattice_updates = cube_edge ** 3 * num_steps
+    total_lattice_updates = cube_edge**3 * num_steps
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
 
 def print_summary(args, elapsed_time, mlups):
     """Print comprehensive simulation summary with parameters and performance results"""
-    total_lattice_points = args.cube_edge ** 3
+    total_lattice_points = args.cube_edge**3
     total_lattice_updates = total_lattice_points * args.num_steps
     lattice_points_per_second = total_lattice_updates / elapsed_time
-    
+
     print("\n\n\n" + "=" * 70)
     print("                    SIMULATION SUMMARY")
     print("=" * 70)
-    
+
     # Simulation Parameters
     print("SIMULATION PARAMETERS:")
     print("-" * 25)
@@ -233,42 +249,42 @@ def print_summary(args, elapsed_time, mlups):
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
     print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
     print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
-    
+
     if args.compute_backend.name == "NEON":
         print(f"  GPU Devices:            {args.gpu_devices}")
-        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
         print(f"  OCC Strategy:           {occ_display}")
-    
+
     print()
-    
+
     # Performance Results
     print("PERFORMANCE RESULTS:")
     print("-" * 20)
     print(f"  Time in main loop:      {elapsed_time:.3f} seconds")
     print(f"  MLUPs:                  {mlups:.2f}")
-    print(f"  Time per LBM step:      {elapsed_time/args.num_steps*1000:.3f} ms")
-    
+    print(f"  Time per LBM step:      {elapsed_time / args.num_steps * 1000:.3f} ms")
+
     if args.compute_backend.name == "NEON" and len(args.gpu_devices) > 1:
         mlups_per_gpu = mlups / len(args.gpu_devices)
         print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f}")
-    
+
     print("=" * 70)
 
 
 def print_summary_with_stats(args, stats):
     """Print comprehensive simulation summary with statistics from multiple repetitions"""
-    total_lattice_points = args.cube_edge ** 3
+    total_lattice_points = args.cube_edge**3
     total_lattice_updates = total_lattice_points * args.num_steps
-    
-    mean_mlups = stats['mean_mlups']
-    std_mlups = stats['std_dev_mlups']
-    mean_elapsed_time = stats['mean_elapsed_time']
-    std_elapsed_time = stats['std_dev_elapsed_time']
-    
+
+    mean_mlups = stats["mean_mlups"]
+    std_mlups = stats["std_dev_mlups"]
+    mean_elapsed_time = stats["mean_elapsed_time"]
+    std_elapsed_time = stats["std_dev_elapsed_time"]
+
     print("\n\n\n" + "=" * 70)
     print("                    SIMULATION SUMMARY")
     print("=" * 70)
-    
+
     # Simulation Parameters
     print("SIMULATION PARAMETERS:")
     print("-" * 25)
@@ -282,43 +298,43 @@ def print_summary_with_stats(args, stats):
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
     print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
     print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
-    
+
     if args.compute_backend.name == "NEON":
         print(f"  GPU Devices:            {args.gpu_devices}")
-        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
         print(f"  OCC Strategy:           {occ_display}")
-    
+
     print()
-    
+
     # Raw Data (if multiple repetitions)
     if args.repetitions > 1:
         print("RAW MEASUREMENT DATA:")
         print("-" * 21)
         print(f"{'Run':<6} {'Elapsed Time (s)':<18} {'MLUPs':<12} {'Time/Step (ms)':<15}")
         print("-" * 53)
-        
-        raw_elapsed_times = stats['raw_elapsed_times']
-        raw_mlups = stats['raw_mlups']
-        
+
+        raw_elapsed_times = stats["raw_elapsed_times"]
+        raw_mlups = stats["raw_mlups"]
+
         for i, (elapsed_time, mlups) in enumerate(zip(raw_elapsed_times, raw_mlups)):
             time_per_step = elapsed_time / args.num_steps * 1000
-            print(f"{i+1:<6} {elapsed_time:<18.3f} {mlups:<12.2f} {time_per_step:<15.3f}")
-        
+            print(f"{i + 1:<6} {elapsed_time:<18.3f} {mlups:<12.2f} {time_per_step:<15.3f}")
+
         print("-" * 53)
         print()
-    
+
     # Performance Results (Statistical Summary)
     print("PERFORMANCE RESULTS:")
     print("-" * 20)
     if args.repetitions > 1:
         print(f"  Time in main loop:      {mean_elapsed_time:.3f} ± {std_elapsed_time:.3f} seconds")
         print(f"  MLUPs:                  {mean_mlups:.2f} ± {std_mlups:.2f}")
-        print(f"  Time per LBM step:      {mean_elapsed_time/args.num_steps*1000:.3f} ± {std_elapsed_time/args.num_steps*1000:.3f} ms")
+        print(f"  Time per LBM step:      {mean_elapsed_time / args.num_steps * 1000:.3f} ± {std_elapsed_time / args.num_steps * 1000:.3f} ms")
     else:
         print(f"  Time in main loop:      {mean_elapsed_time:.3f} seconds")
         print(f"  MLUPs:                  {mean_mlups:.2f}")
-        print(f"  Time per LBM step:      {mean_elapsed_time/args.num_steps*1000:.3f} ms")
-    
+        print(f"  Time per LBM step:      {mean_elapsed_time / args.num_steps * 1000:.3f} ms")
+
     if args.compute_backend.name == "NEON" and len(args.gpu_devices) > 1:
         mlups_per_gpu = mean_mlups / len(args.gpu_devices)
         if args.repetitions > 1:
@@ -326,19 +342,19 @@ def print_summary_with_stats(args, stats):
             print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f} ± {mlups_per_gpu_std:.2f}")
         else:
             print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f}")
-    
+
     print("=" * 70)
 
 
 def print_scalability_summary(args, stats_list):
     """Print comprehensive scalability summary with MLUPs statistics for different GPU counts"""
-    total_lattice_points = args.cube_edge ** 3
+    total_lattice_points = args.cube_edge**3
     total_lattice_updates = total_lattice_points * args.num_steps
-    
+
     print("\n\n\n" + "=" * 95)
     print("                           SCALABILITY ANALYSIS")
     print("=" * 95)
-    
+
     # Simulation Parameters
     print("SIMULATION PARAMETERS:")
     print("-" * 25)
@@ -350,42 +366,42 @@ def print_scalability_summary(args, stats_list):
     print(f"  Compute Backend:        {args.compute_backend.name}")
     print(f"  Precision Policy:       {args.precision}")
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
-    
+
     if args.compute_backend.name == "NEON":
-        occ_display = str(args.occ).split('.')[-1] if hasattr(args.occ, '__class__') else args.occ
+        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
         print(f"  OCC Strategy:           {occ_display}")
         print(f"  Available GPU Devices:  {args.gpu_devices}")
-    
+
     print()
-    
+
     # Extract mean MLUPs for calculations
-    mlups_means = [stats['mean_mlups'] for stats in stats_list]
+    mlups_means = [stats["mean_mlups"] for stats in stats_list]
     baseline_mlups = mlups_means[0] if mlups_means else 0
-    
+
     # Scalability Results
     print("SCALABILITY RESULTS:")
     print("-" * 20)
     print(f"{'GPUs':<6} {'MLUPs (mean±std)':<18} {'Speedup':<10} {'Efficiency':<12} {'MLUPs/GPU':<12}")
     print("-" * 68)
-    
+
     for i, stats in enumerate(stats_list):
         num_gpus = i + 1
-        mean_mlups = stats['mean_mlups']
-        std_mlups = stats['std_dev_mlups']
+        mean_mlups = stats["mean_mlups"]
+        std_mlups = stats["std_dev_mlups"]
         speedup = mean_mlups / baseline_mlups if baseline_mlups > 0 else 0
         efficiency = (speedup / num_gpus) if num_gpus > 0 else 0
         mlups_per_gpu = mean_mlups / num_gpus if num_gpus > 0 else 0
-        
+
         # Format MLUPs with standard deviation
         if args.repetitions > 1:
             mlups_str = f"{mean_mlups:.2f}±{std_mlups:.2f}"
         else:
             mlups_str = f"{mean_mlups:.2f}"
-        
+
         print(f"{num_gpus:<6} {mlups_str:<18} {speedup:<10.2f} {efficiency:<11.3f} {mlups_per_gpu:<12.2f}")
-    
+
     print("-" * 68)
-    
+
     # Summary Statistics
     if len(stats_list) > 1:
         max_mlups = max(mlups_means)
@@ -393,7 +409,7 @@ def print_scalability_summary(args, stats_list):
         max_speedup = max_mlups / baseline_mlups if baseline_mlups > 0 else 0
         best_efficiency_idx = 0
         best_efficiency = 0.0
-        
+
         for i, mean_mlups in enumerate(mlups_means):
             num_gpus = i + 1
             speedup = mean_mlups / baseline_mlups if baseline_mlups > 0 else 0
@@ -401,57 +417,60 @@ def print_scalability_summary(args, stats_list):
             if efficiency > best_efficiency:
                 best_efficiency = efficiency
                 best_efficiency_idx = i
-        
+
         print()
         print("SUMMARY STATISTICS:")
         print("-" * 19)
         print(f"  Best Performance:       {max_mlups:.2f} MLUPs ({max_mlups_idx + 1} GPUs)")
         if args.repetitions > 1:
-            max_std = stats_list[max_mlups_idx]['std_dev_mlups']
+            max_std = stats_list[max_mlups_idx]["std_dev_mlups"]
             print(f"  Performance Std Dev:    ±{max_std:.2f} MLUPs")
         print(f"  Maximum Speedup:        {max_speedup:.2f}x")
         print(f"  Best Efficiency:        {best_efficiency:.3f} ({best_efficiency_idx + 1} GPUs)")
         print(f"  Scalability Range:      1-{len(stats_list)} GPUs")
-    
+
     print("=" * 95)
 
 
 def report(args, stats):
     import neon
+
     report = neon.Report("LBM MLUPS LDC")
-    report.add_member('velocity_set', args.velocity_set.__class__.__name__)
-    report.add_member('compute_backend', args.compute_backend.name)
-    report.add_member('precision_policy', args.precision)
-    report.add_member('grid_size', args.cube_edge)
-    report.add_member('num_steps', args.num_steps)
-    report.add_member('repetitions', args.repetitions)
-    
+    report.add_member("velocity_set", args.velocity_set.__class__.__name__)
+    report.add_member("compute_backend", args.compute_backend.name)
+    report.add_member("precision_policy", args.precision)
+    report.add_member("grid_size", args.cube_edge)
+    report.add_member("num_steps", args.num_steps)
+    report.add_member("repetitions", args.repetitions)
+
     # Statistical measures
-    report.add_member('mean_elapsed_time', stats['mean_elapsed_time'])
-    report.add_member('mean_mlups', stats['mean_mlups'])
-    report.add_member('std_dev_elapsed_time', stats['std_dev_elapsed_time'])
-    report.add_member('std_dev_mlups', stats['std_dev_mlups'])
-    
+    report.add_member("mean_elapsed_time", stats["mean_elapsed_time"])
+    report.add_member("mean_mlups", stats["mean_mlups"])
+    report.add_member("std_dev_elapsed_time", stats["std_dev_elapsed_time"])
+    report.add_member("std_dev_mlups", stats["std_dev_mlups"])
+
     # Raw data vectors (if multiple repetitions)
     if args.repetitions > 1:
-        report.add_member_vector('raw_elapsed_times', stats['raw_elapsed_times'])
-        report.add_member_vector('raw_mlups', stats['raw_mlups'])
-    
+        report.add_member_vector("raw_elapsed_times", stats["raw_elapsed_times"])
+        report.add_member_vector("raw_mlups", stats["raw_mlups"])
+
     # Legacy fields for backwards compatibility
-    report.add_member('elapsed_time', stats['mean_elapsed_time'])
-    report.add_member('mlups', stats['mean_mlups'])
-    
-    report.add_member('occ', (args.occ.to_string() ))
-    report.add_member_vector('gpu_devices', args.gpu_devices)
-    report.add_member('num_devices', len(args.gpu_devices))
-    report.add_member('measure_scalability', args.measure_scalability)
-
-    report_name = 'mlups_3d_'+f'size_{args.cube_edge}'
+    report.add_member("elapsed_time", stats["mean_elapsed_time"])
+    report.add_member("mlups", stats["mean_mlups"])
+
+    report.add_member("occ", (args.occ.to_string()))
+    report.add_member_vector("gpu_devices", args.gpu_devices)
+    report.add_member("num_devices", len(args.gpu_devices))
+    report.add_member("measure_scalability", args.measure_scalability)
+
+    report_name = "mlups_3d_" + f"size_{args.cube_edge}"
     if args.measure_scalability:
-        report_name += f'_dev_{len(args.gpu_devices)}'
+        report_name += f"_dev_{len(args.gpu_devices)}"
     if args.repetitions > 1:
-        report_name += f'_rep_{args.repetitions}'
+        report_name += f"_rep_{args.repetitions}"
     report.write(report_name, True)
+
+
 # -------------------------- Simulation Loop --------------------------
 
 
@@ -461,26 +480,35 @@ def benchmark(args):
 
     elapsed_time_list = []
     mlups_list = []
-    elapsed_time_list = run_simulation(compute_backend=compute_backend, 
-                                precision_policy=precision_policy, 
-                                grid_shape=grid_shape, 
-                                num_steps=args.num_steps, 
-                                options=options,
-                                export_final_velocity=args.export_final_velocity,
-                                repetitions=args.repetitions,
-                                num_devices=len(args.gpu_devices))
-    
+    elapsed_time_list = run_simulation(
+        compute_backend=compute_backend,
+        precision_policy=precision_policy,
+        grid_shape=grid_shape,
+        num_steps=args.num_steps,
+        options=options,
+        export_final_velocity=args.export_final_velocity,
+        repetitions=args.repetitions,
+        num_devices=len(args.gpu_devices),
+    )
+
     for elapsed_time in elapsed_time_list:
         mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
         mlups_list.append(mlups)
 
-
     mean_mlups = np.mean(mlups_list)
     std_dev_mlups = np.std(mlups_list)
     mean_elapsed_time = np.mean(elapsed_time_list)
     std_dev_elapsed_time = np.std(elapsed_time_list)
 
-    stats = {'mean_mlups': mean_mlups, 'std_dev_mlups': std_dev_mlups, 'mean_elapsed_time': mean_elapsed_time, 'std_dev_elapsed_time': std_dev_elapsed_time, 'num_devices': len(args.gpu_devices), 'raw_mlups': mlups_list, 'raw_elapsed_times': elapsed_time_list}
+    stats = {
+        "mean_mlups": mean_mlups,
+        "std_dev_mlups": std_dev_mlups,
+        "mean_elapsed_time": mean_elapsed_time,
+        "std_dev_elapsed_time": std_dev_elapsed_time,
+        "num_devices": len(args.gpu_devices),
+        "raw_mlups": mlups_list,
+        "raw_elapsed_times": elapsed_time_list,
+    }
     # Generate report if requested
     if args.report:
         report(args, stats)
@@ -488,6 +516,7 @@ def benchmark(args):
 
     return stats
 
+
 def main():
     args = parse_arguments()
     if not args.measure_scalability:
@@ -499,6 +528,7 @@ def main():
     stats_list = []
     for num_devices in range(1, len(args.gpu_devices) + 1):
         import copy
+
         args_copy = copy.deepcopy(args)
         args_copy.gpu_devices = args_copy.gpu_devices[:num_devices]
         stats = benchmark(args_copy)
diff --git a/xlb/default_config.py b/xlb/default_config.py
index a57c1477..f709f0d5 100644
--- a/xlb/default_config.py
+++ b/xlb/default_config.py
@@ -48,6 +48,7 @@ def default_backend() -> ComputeBackend:
 
 def check_backend_support():
     import jax
+
     if jax.devices()[0].platform == "gpu":
         gpus = jax.devices("gpu")
         if len(gpus) > 1:
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index b572a4d9..36ae8ed2 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -8,29 +8,30 @@
 
 
 class NeonGrid(Grid):
-    def __init__(self,
-                 shape,  # bounding box of the domain
-                 velocity_set,  # velocity set for the grid
-                 backend_config = None,
-                 ):
+    def __init__(
+        self,
+        shape,  # bounding box of the domain
+        velocity_set,  # velocity set for the grid
+        backend_config=None,
+    ):
         from .warp_grid import WarpGrid
 
         if backend_config is None:
             backend_config = {
-                'device_list': [0],
-                'skeleton_config': neon.SkeletonConfig.none(),
+                "device_list": [0],
+                "skeleton_config": neon.SkeletonConfig.none(),
             }
 
         # check that the config dictionary has the required keys
-        required_keys = ['device_list']
+        required_keys = ["device_list"]
         for key in required_keys:
             if key not in backend_config:
                 raise ValueError(f"backend_config must contain a '{key}' key")
-        
-        #check that the device list is a list of integers
-        if not isinstance(backend_config['device_list'], list):
+
+        # check that the device list is a list of integers
+        if not isinstance(backend_config["device_list"], list):
             raise ValueError(f"backend_config['device_list'] must be a list of integers")
-        for device in backend_config['device_list']:
+        for device in backend_config["device_list"]:
             if not isinstance(device, int):
                 raise ValueError(f"backend_config['device_list'] must be a list of integers")
 
@@ -46,9 +47,9 @@ def __init__(self,
     def _get_velocity_set(self):
         return self.velocity_set
 
-    def _initialize_backend(self):     
-        dev_idx_list = self.config['device_list']
-        
+    def _initialize_backend(self):
+        dev_idx_list = self.config["device_list"]
+
         if len(self.shape) == 2:
             import py_neon
 
@@ -72,10 +73,10 @@ def _initialize_backend(self):
         pass
 
     def create_field(
-            self,
-            cardinality: int,
-            dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
-            fill_value=None,
+        self,
+        cardinality: int,
+        dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
+        fill_value=None,
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
         field = self.grid.new_field(
@@ -90,8 +91,7 @@ def create_field(
         return field
 
     def _create_warp_field(
-            self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
-            fill_value=None, ne_field=None
+        self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None, fill_value=None, ne_field=None
     ):
         warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
         if ne_field is None:
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 83c28e3a..22bde94d 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -433,6 +433,7 @@ def domain_bounds_launcher(loader: neon.Loader):
                     wp_bc_indices = wp_bc_indices_[dev_idx]
                     wp_id_numbers = wp_id_numbers_[dev_idx]
                     wp_is_interior = wp_is_interior_[dev_idx]
+
                 @wp.func
                 def domain_bounds_kernel(index: Any):
                     # apply the functional
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 17acbfe0..1acef90e 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -693,7 +693,7 @@ def finest_fused_pull(
                 # throw an exception
                 raise Exception("Only the finest level is supported for now")
             grid = f_0_fd.get_grid()
-            num_levels =grid.num_levels
+            num_levels = grid.num_levels
 
             # if level != 0:
             #     # throw an exception
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 9d44cf8f..e32b7d2c 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -120,7 +120,7 @@ def _process_boundary_conditions(self, boundary_conditions, f_1, bc_mask, missin
             velocity_set=DefaultConfig.velocity_set,
             precision_policy=DefaultConfig.default_precision_policy,
             compute_backend=DefaultConfig.default_backend,
-            grid=self.grid
+            grid=self.grid,
         )
 
         # Split boundary conditions by type
@@ -472,10 +472,10 @@ def nse_stepper_ll(loader: neon.Loader):
                 loader.set_grid(bc_mask_fd.get_grid())
 
                 f_0_pn = loader.get_read_handle(
-                    f_0_fd, 
-                    operation=neon.Loader.Operation.stencil, 
-                    discretization = neon.Loader.Discretization.lattice,
-                    )
+                    f_0_fd,
+                    operation=neon.Loader.Operation.stencil,
+                    discretization=neon.Loader.Discretization.lattice,
+                )
                 bc_mask_pn = loader.get_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_read_handle(missing_mask_fd)
 
@@ -526,27 +526,23 @@ def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
     def prepare_skeleton(self, f_0, f_1, bc_mask, missing_mask, omega):
         grid = f_0.get_grid()
         bk = grid.backend
-        self.neon_skeleton = {'odd': {}, 'even': {}}
-        self.neon_skeleton['odd']['container'] = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, 0)
-        self.neon_skeleton['even']['container'] = self.neon_container(f_1, f_0, bc_mask, missing_mask, omega, 1)
+        self.neon_skeleton = {"odd": {}, "even": {}}
+        self.neon_skeleton["odd"]["container"] = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, 0)
+        self.neon_skeleton["even"]["container"] = self.neon_container(f_1, f_0, bc_mask, missing_mask, omega, 1)
         # check if 'occ' is a valid key
-        if 'occ' not in self.backend_config:
+        if "occ" not in self.backend_config:
             occ = neon.SkeletonConfig.none()
         else:
-            occ = self.backend_config['occ']
+            occ = self.backend_config["occ"]
             # check that occ is of type neon.SkeletonConfig.OCC
             if not isinstance(occ, neon.SkeletonConfig.OCC):
                 print(type(occ))
                 raise ValueError("occ must be of type neon.SkeletonConfig.OCC")
-        
+
         for key in self.neon_skeleton:
-            self.neon_skeleton[key]['app'] = [self.neon_skeleton[key]['container']]
-            self.neon_skeleton[key]['skeleton'] = neon.Skeleton(backend=bk)
-            self.neon_skeleton[key]['skeleton'].sequence(
-                name = "mres_nse_stepper", 
-                containers=self.neon_skeleton[key]['app'],
-                occ = occ)
-
-        self.sk = [self.neon_skeleton['odd']['skeleton'],
-                   self.neon_skeleton['even']['skeleton']]
+            self.neon_skeleton[key]["app"] = [self.neon_skeleton[key]["container"]]
+            self.neon_skeleton[key]["skeleton"] = neon.Skeleton(backend=bk)
+            self.neon_skeleton[key]["skeleton"].sequence(name="mres_nse_stepper", containers=self.neon_skeleton[key]["app"], occ=occ)
+
+        self.sk = [self.neon_skeleton["odd"]["skeleton"], self.neon_skeleton["even"]["skeleton"]]
         self.sk_iter = 0
diff --git a/xlb/precision_policy.py b/xlb/precision_policy.py
index f82745a3..39e3e096 100644
--- a/xlb/precision_policy.py
+++ b/xlb/precision_policy.py
@@ -3,7 +3,6 @@
 from enum import Enum, auto
 
 
-
 class Precision(Enum):
     FP64 = auto()
     FP32 = auto()
@@ -14,6 +13,7 @@ class Precision(Enum):
     @property
     def wp_dtype(self):
         import warp as wp
+
         if self == Precision.FP64:
             return wp.float64
         elif self == Precision.FP32:
@@ -84,10 +84,12 @@ def store_precision(self):
 
     def cast_to_compute_jax(self, array):
         import jax.numpy as jnp
+
         compute_precision = self.compute_precision
         return jnp.array(array, dtype=compute_precision.jax_dtype)
 
     def cast_to_store_jax(self, array):
         import jax.numpy as jnp
+
         store_precision = self.store_precision
         return jnp.array(array, dtype=store_precision.jax_dtype)
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index d0cd5dd1..907b842f 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -445,7 +445,7 @@ def get_fields_data(self, field_neon_dict):
 
         # Ensure that this operator is called on multires grids
         grid_mres = next(iter(field_neon_dict.values())).get_grid()
-        assert grid_mres.name== "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
+        assert grid_mres.name == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
 
         for field_name in field_neon_dict.keys():
             assert field_name in self.field_name_cardinality_dict.keys(), (

From e09c54dfdfa301af75bcfe5f086794f8cbc22b95 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 2 Sep 2025 13:55:10 -0400
Subject: [PATCH 169/208] corrected interpolated bounceback method to achieve
 better results when applying hybridBC as inlet

---
 .../cfd/grid_refinement/cuboid_flow_past_sphere_3d.py     | 8 +++++++-
 xlb/operator/boundary_condition/helper_functions_bc.py    | 7 ++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 355c11e0..02e08291 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -174,7 +174,10 @@ def bc_profile():
     H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
     H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
     two = dtype(2.0)
+    one = dtype(1.0)
+    zero = dtype(0.0)
     u_max_wp = dtype(u_max)
+    _u_vec = wp.vec(velocity_set.d, dtype=dtype)
 
     @wp.func
     def bc_profile_warp(index: wp.vec3i):
@@ -188,7 +191,9 @@ def bc_profile_warp(index: wp.vec3i):
         r_squared = (two * y_center / H_y) ** two + (two * z_center / H_z) ** two
 
         # Parabolic profile: u = u_max * (1 - r²)
-        return wp.vec(u_max_wp * wp.max(dtype(0.0), dtype(1.0) - r_squared), length=1)
+        # Note that unlike RegularizedBC and ZouHeBC which only accept normal velocity, hybridBC accepts the full velocity vector
+        # return _u_vec(u_max_wp * wp.max(zero, one - r_squared), zero, zero)
+        return wp.vec(u_max_wp * wp.max(zero, one - r_squared), length=1)
 
     return bc_profile_warp
 
@@ -200,6 +205,7 @@ def bc_profile_warp(index: wp.vec3i):
 
 # Initialize Boundary Conditions
 bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
+# bc_left = HybridBC(bc_method="bounceback_regularized", profile=bc_profile(), indices=inlet)
 # Alternatively, use a prescribed velocity profile
 # bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
 bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 0983b646..3bb3967f 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -251,13 +251,14 @@ def interpolated_bounceback(
                     if needs_mesh_distance:
                         # use weights associated with curved boundaries that are properly stored in f_1.
                         weight = compute_dtype(self.distance_decoder_function(f_1, index, l))
-                    else:
-                        weight = compute_dtype(0.5)
 
-                    if _missing_mask[_opp_indices[l]] == wp.uint8(0):
                         # Use differentiable interpolated BB to find f_missing:
                         f_post[l] = ((one - weight) * f_post[_opp_indices[l]] + weight * (f_pre[l] + f_pre[_opp_indices[l]])) / (one + weight)
                     else:
+                        # Use regular halfway bounceback
+                        f_post[l] = f_pre[_opp_indices[l]]
+
+                    if _missing_mask[_opp_indices[l]] == wp.uint8(1):
                         # These are cases where the boundary is sandwiched between 2 solid cells and so both opposite directions are missing.
                         f_post[l] = f_pre[_opp_indices[l]]
 

From ecd4103de01529117ae09a373d2bb4c4f35cfaae Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 3 Sep 2025 13:48:10 +0200
Subject: [PATCH 170/208] refactoring(PR): applying changes based on the review
 from PR #18

---
 examples/performance/mlups_3d.py              |  9 +++-
 .../indices_boundary_masker.py                | 42 +++++++------------
 xlb/operator/stepper/nse_stepper.py           |  1 -
 3 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 28f55323..2fd2174a 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -102,6 +102,12 @@ def parse_arguments():
         velocity_set = xlb.velocity_set.D3Q27(precision_policy=args.precision_policy, compute_backend=compute_backend)
     args.velocity_set = velocity_set
 
+    if args.gpu_devices is not None and args.compute_backend != ComputeBackend.NEON:
+        raise ValueError("--gpu_devices can be used only with the Neon backend.")
+
+    if args.gpu_devices is None:
+        args.gpu_devices = [0]
+
     print_args(args)
 
     return args
@@ -120,12 +126,13 @@ def print_args(args):
     print(f"Velocity Set:         {args.velocity_set.__class__.__name__}")
     print(f"Generate Report:      {'Yes' if args.report else 'No'}")
     print(f"Measure Scalability:  {'Yes' if args.measure_scalability else 'No'}")
+    print(f"Export Velocity:      {'Yes' if args.export_final_velocity else 'No'}")
     print(f"Repetitions:          {args.repetitions}")
 
     if args.compute_backend.name == "NEON":
         print(f"GPU Devices:          {args.gpu_devices}")
         # Convert the neon OCC enum back to string for display
-        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
+        occ_display = args.occ.to_string() if hasattr(args.occ, "__class__") else args.occ
         print(f"OCC Strategy:         {occ_display}")
 
     print("=" * 60)
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 22bde94d..55a1bb3d 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -329,17 +329,19 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
         is_interior = is_interior[:total_index]
 
         # Convert to Warp arrays
+        def _to_wp_arrays(indices, id_numbers, is_interior, device=None):
+            return (
+                wp.array(indices, dtype=wp.int32, device=device),
+                wp.array(id_numbers, dtype=wp.uint8, device=device),
+                wp.array(is_interior, dtype=wp.uint8, device=device),
+            )
+
         if self.compute_backend == ComputeBackend.NEON:
             grid = self.grid
-            if grid is None:
-                ndevice = 1
-            else:
-                ndevice = grid.bk.get_num_devices()
+            ndevice = 1 if grid is None else grid.bk.get_num_devices()
+
             if ndevice == 1:
-                wp_bc_indices = wp.array(indices, dtype=wp.int32)
-                wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
-                wp_is_interior = wp.array(is_interior, dtype=wp.uint8)
-                return wp_bc_indices, wp_id_numbers, wp_is_interior
+                return _to_wp_arrays(indices, id_numbers, is_interior)
             else:
                 # For multi-device, we need to split the indices across devices
                 wp_bc_indices = []
@@ -352,11 +354,7 @@ def _prepare_kernel_inputs(self, bclist, grid_shape):
                     wp_is_interior.append(wp.array(is_interior, dtype=wp.uint8, device=device_name))
                 return wp_bc_indices, wp_id_numbers, wp_is_interior
         else:
-            # Convert to Warp arrays
-            wp_bc_indices = wp.array(indices, dtype=wp.int32)
-            wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
-            wp_is_interior = wp.array(is_interior, dtype=wp.uint8)
-            return wp_bc_indices, wp_id_numbers, wp_is_interior
+            return _to_wp_arrays(indices, id_numbers, is_interior)
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
@@ -429,10 +427,10 @@ def domain_bounds_launcher(loader: neon.Loader):
                     wp_id_numbers = wp_id_numbers_
                     wp_is_interior = wp_is_interior_
                 else:
-                    dev_idx = loader.get_device_id()
-                    wp_bc_indices = wp_bc_indices_[dev_idx]
-                    wp_id_numbers = wp_id_numbers_[dev_idx]
-                    wp_is_interior = wp_is_interior_[dev_idx]
+                    device_id = loader.get_device_id()
+                    wp_bc_indices = wp_bc_indices_[device_id]
+                    wp_id_numbers = wp_id_numbers_[device_id]
+                    wp_is_interior = wp_is_interior_[device_id]
 
                 @wp.func
                 def domain_bounds_kernel(index: Any):
@@ -534,11 +532,6 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
 
         # If there are no interior boundary conditions, skip the rest and retun early
         if not bc_interior:
-            # wp.synchronize()
-            # bc_mask.update_host(0)
-            # wp.synchronize()
-            # bc_mask.export_vti("bc_mask.vti", "m")
-            # wp.synchronize()
             return bc_mask, missing_mask
 
         # Prepare the second and third kernel inputs for only a subset of boundary conditions associated with the interior
@@ -556,9 +549,4 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         )
         container_interior_bc_mask.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
 
-        # wp.synchronize()
-        # bc_mask.update_host(0)
-        # wp.synchronize()
-        # bc_mask.export_vti(f"{"bc_mask"}.vti", "u")
-
         return bc_mask, missing_mask
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index e32b7d2c..7abcf861 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -129,7 +129,6 @@ def _process_boundary_conditions(self, boundary_conditions, f_1, bc_mask, missin
 
         # Process indices-based boundary conditions
         if bc_with_indices:
-            grid = self.get_grid()
             bc_mask, missing_mask = indices_masker(bc_with_indices, bc_mask, missing_mask)
 
         # Process mesh-based boundary conditions for 3D

From 4926d7584531336f15043bdc2c36466ae4ab5482 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 4 Sep 2025 11:16:35 -0400
Subject: [PATCH 171/208] minor refactoring

---
 .../cuboid_flow_past_sphere_3d.py             |  4 +++
 .../boundary_condition/helper_functions_bc.py | 26 +++++++++----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 02e08291..9e8f71f3 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -192,7 +192,11 @@ def bc_profile_warp(index: wp.vec3i):
 
         # Parabolic profile: u = u_max * (1 - r²)
         # Note that unlike RegularizedBC and ZouHeBC which only accept normal velocity, hybridBC accepts the full velocity vector
+
+        # For hybridBC
         # return _u_vec(u_max_wp * wp.max(zero, one - r_squared), zero, zero)
+
+        # For Regularized and ZouHe
         return wp.vec(u_max_wp * wp.max(zero, one - r_squared), length=1)
 
     return bc_profile_warp
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 3bb3967f..1801247f 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -401,22 +401,21 @@ def encoder_functional(
                 wp.printf("Error: User-defined profile must return a vector of size %d\n", _num_of_aux_data)
                 return
 
-            # Write the result for all q directions, but only store up to num_of_aux_data
+            # Write the result for all q directions, but only store up to _num_of_aux_data
             counter = wp.int32(0)
             for l in range(self.velocity_set.q):
-                # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                # Only store up to _num_of_aux_data
+                if counter == _num_of_aux_data:
+                    return
+
                 if l == lattice_central_index:
                     # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
                     self.write_field(field_storage, index, l, self.store_dtype(prescribed_values[l]))
                     counter += 1
-                elif _missing_mask[l] == wp.uint8(1) and counter <= _num_of_aux_data:
+                elif _missing_mask[l] == wp.uint8(1):
                     # The other remaining BC auxiliary data are stored in missing directions of f_1.
-                    # Only store up to num_of_aux_data
                     self.write_field(field_storage, index, _opp_indices[l], self.store_dtype(prescribed_values[l]))
                     counter += 1
-                if counter > _num_of_aux_data:
-                    # Only store up to num_of_aux_data
-                    return
 
         @wp.func
         def decoder_functional(
@@ -431,24 +430,23 @@ def decoder_functional(
             # Define a vector to hold prescribed_values
             prescribed_values = _aux_vec()
 
-            # Read all q directions, but only retrieve up to num_of_aux_data
+            # Read all q directions, but only retrieve up to _num_of_aux_data
             counter = wp.int32(0)
             for l in range(self.velocity_set.q):
-                # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
+                # Only retrieve up to _num_of_aux_data
+                if counter == _num_of_aux_data:
+                    return prescribed_values
+
                 if l == lattice_central_index:
                     # The first BC auxiliary data is stored in the zero'th index of f_1 associated with its center.
                     value = self.read_field(field_storage, index, l)
                     prescribed_values[counter] = self.compute_dtype(value)
                     counter += 1
-                elif _missing_mask[l] == wp.uint8(1) and counter <= _num_of_aux_data:
+                elif _missing_mask[l] == wp.uint8(1):
                     # The other remaining BC auxiliary data are stored in missing directions of f_1.
-                    # Only store up to num_of_aux_data
                     value = self.read_field(field_storage, index, _opp_indices[l])
                     prescribed_values[counter] = self.compute_dtype(value)
                     counter += 1
-                if counter > _num_of_aux_data:
-                    # Only retrieve up to num_of_aux_data
-                    return prescribed_values
 
         # Construct the warp kernel
         @wp.kernel

From ffa0c49598a896bf37171c3892462bc62f2ec854 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 4 Sep 2025 17:23:17 -0400
Subject: [PATCH 172/208] enabled handling of user-defined profiles that span
 across multiple levels

---
 .../cfd/grid_refinement/cuboid_flow_past_sphere_3d.py    | 7 +++----
 xlb/operator/boundary_condition/helper_functions_bc.py   | 9 ++++-----
 xlb/operator/boundary_masker/helper_functions_masker.py  | 2 +-
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 9e8f71f3..35770231 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -167,12 +167,11 @@ def prepare_sparsity_pattern(level_data):
 # Define Boundary Conditions
 def bc_profile():
     assert compute_backend == ComputeBackend.NEON
-
-    # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
+    # IMPORTANT NOTE: the user defined functional must be defined in terms of the indices at the finest level
     _, ny, nz = grid_shape_finest
     dtype = precision_policy.compute_precision.wp_dtype
-    H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
-    H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
+    H_y = dtype(ny)  # Length in y direction (finest level)
+    H_z = dtype(nz)  # Length in z direction (finest level)
     two = dtype(2.0)
     one = dtype(1.0)
     zero = dtype(0.0)
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 1801247f..1b5f6625 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -317,7 +317,7 @@ def interpolated_nonequilibrium_bounceback(
 
         @wp.func
         def neon_index_to_warp(neon_field_hdl: Any, index: Any):
-            # Unpack the global index in Neon
+            # Unpack the global index in Neon at the finest level and convert it to a warp vector
             cIdx = wp.neon_global_idx(neon_field_hdl, index)
             gx = wp.neon_get_x(cIdx)
             gy = wp.neon_get_y(cIdx)
@@ -584,9 +584,6 @@ def aux_data_init_ll(loader: neon.Loader):
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask)
 
-                # Get the refinement factor for the current level
-                refinement = 2**level
-
                 @wp.func
                 def aux_data_init_cl(index: Any):
                     # read tid data
@@ -594,8 +591,10 @@ def aux_data_init_cl(index: Any):
 
                     # Apply the functional
                     if _boundary_id == _id:
+                        # IMPORTANT NOTE:
+                        # It is assumed in XLB that the user_defined_functional in multi-res simulations is defined in terms of the indices at the finest level.
+                        # This assumption enables handling of BCs whose indices span multiple levels
                         warp_index = self.bc_helper.neon_index_to_warp(f_1_pn, index)
-                        warp_index /= refinement
                         prescribed_values = self.user_defined_functional(warp_index)
 
                         # Call the functional
diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index f34a8ad5..470f2362 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -23,7 +23,7 @@ def __init__(self, velocity_set=None, precision_policy=None, compute_backend=Non
 
         @wp.func
         def neon_index_to_warp(neon_field_hdl: Any, index: Any):
-            # Unpack the global index in Neon
+            # Unpack the global index in Neon at the finest level and convert it to a warp vector
             cIdx = wp.neon_global_idx(neon_field_hdl, index)
             gx = wp.neon_get_x(cIdx)
             gy = wp.neon_get_y(cIdx)

From 9d83c1ba9548468fb2b4800539bb3dd62603f62f Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 5 Sep 2025 11:17:20 -0400
Subject: [PATCH 173/208] This commit addresses the issue discussed in
 https://github.com/hsalehipour/XLB/issues/20

---
 xlb/operator/boundary_condition/bc_hybrid.py | 64 +++++---------------
 1 file changed, 14 insertions(+), 50 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index b53c9cf7..fd717fe3 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -81,9 +81,6 @@ def __init__(
             distance_decoder_function=self._construct_distance_decoder_function(),
         )
 
-        # A flag to track if available space in "f_1" for storing auxiliary data is full
-        self.auxiliary_storage_space_full = False
-
         # A flag to enable moving wall treatment when either "prescribed_value" or "profile" are provided.
         self.needs_moving_wall_treatment = False
 
@@ -129,9 +126,6 @@ def prescribed_profile_warp(index: Any):
             # a wrapper function that also accepts time as a parameter.
             self.is_time_dependent = True
 
-            # For time dependent prescribed values, we cannot store at initialization
-            self.auxiliary_storage_space_full = True
-
         # This BC class accepts both constant prescribed values of velocity with keyword "prescribed_value" or
         # velocity profiles given by keyword "profile" which must be a callable function.
         self.profile = profile
@@ -143,7 +137,6 @@ def prescribed_profile_warp(index: Any):
         if self.needs_mesh_distance:
             # This BC needs auxiliary data recovery after streaming
             self.needs_aux_recovery = True
-            self.auxiliary_storage_space_full = True
 
         # If this BC is defined using indices, it would need padding in order to find missing directions
         # when imposed on a geometry that is in the domain interior
@@ -155,36 +148,8 @@ def prescribed_profile_warp(index: Any):
         else:
             assert self.indices is None, "Cannot use indices with mesh vertices! Please provide mesh vertices only."
 
-        if not self.auxiliary_storage_space_full:
-            # In the following two cases we simply call the user-defined profile warp function
-            # (i)  mesh distance data are already stored in f_1
-            # (ii) the user-defined functional is time-dependent and cannot be stored only once during initialization
-            # This BC needs auxiliary data initialization before streaming
-            self.needs_aux_init = True
-
-            # This BC needs auxiliary data recovery after streaming
-            self.needs_aux_recovery = True
-
-            # This BC needs one auxiliary data for the density or normal velocity
-            # The user prescribed function for velocity profile (eg. rotating velocity) can be stored and retrived using f_1
-            self.num_of_aux_data = 3
-
-            # Create the encoder operator for storing the auxiliary data
-            encode_auxiliary_data = EncodeAuxiliaryData(
-                self.id,
-                self.num_of_aux_data,
-                self.profile,
-                velocity_set=self.velocity_set,
-                precision_policy=self.precision_policy,
-                compute_backend=self.compute_backend,
-            )
-
-            # Get auxiliary decoder functional
-            functional_dict, _ = encode_auxiliary_data._construct_warp()
-            self.decoder_functional = functional_dict["decoder"]
-
-        # Define the profile decoder functional
-        self.profile_decoder_functional = self._construct_profile_decoder_functional()
+        # Define the profile functional
+        self.profile_functional = self._construct_profile_functional()
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0))
@@ -205,9 +170,9 @@ def distance_decoder_function(f_1: Any, index: Any, direction: Any):
 
         return distance_decoder_function
 
-    def _construct_profile_decoder_functional(self):
+    def _construct_profile_functional(self):
         """
-        Get the profile decoder functional for this BC.
+        Get the profile functional for this BC.
         Note:
         We can impose a profile on a boundary which requires mesh-distance only if that boundary lives on the finest level.
         This is because I don't know how to extract "level" from the
@@ -218,17 +183,16 @@ def _construct_profile_decoder_functional(self):
                gz = wp.neon_get_z(cIdx) // 2 ** level
         """
 
-        # Get decoder functional
         @wp.func
-        def decoder_functional(f_1: Any, index: Any, timestep: Any, _missing_mask: Any):
-            if wp.static(not self.auxiliary_storage_space_full):
-                return self.decoder_functional(f_1, index, _missing_mask)
-            elif wp.static(self.is_time_dependent):
-                return self.profile(index, timestep)
+        def profile_functional(f_1: Any, index: Any, timestep: Any):
+            # Convert neon index to warp index
+            warp_index = self.bc_helper.neon_index_to_warp(f_1, index)
+            if wp.static(self.is_time_dependent):
+                return self.profile(warp_index, timestep)
             else:
-                return self.profile(index)
+                return self.profile(warp_index)
 
-        return decoder_functional
+        return profile_functional
 
     def _construct_warp(self):
         # Construct the functionals for this BC
@@ -250,7 +214,7 @@ def hybrid_bounceback_regularized(
             #     in: 41st aerospace sciences meeting and exhibit, p. 953.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
-            u_wall = self.profile_decoder_functional(f_1, index, timestep, _missing_mask)
+            u_wall = self.profile_functional(f_1, index, timestep)
             f_post = self.bc_helper.interpolated_bounceback(
                 index,
                 _missing_mask,
@@ -289,7 +253,7 @@ def hybrid_bounceback_grads(
             #     in: 41st aerospace sciences meeting and exhibit, p. 953.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
-            u_wall = self.profile_decoder_functional(f_1, index, timestep, _missing_mask)
+            u_wall = self.profile_functional(f_1, index, timestep)
             f_post = self.bc_helper.interpolated_bounceback(
                 index,
                 _missing_mask,
@@ -327,7 +291,7 @@ def hybrid_nonequilibrium_regularized(
             #     boundaries in the lattice Boltzmann method. Physical Review E 77, 056703.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
-            u_wall = self.profile_decoder_functional(f_1, index, timestep, _missing_mask)
+            u_wall = self.profile_functional(f_1, index, timestep)
             f_post = self.bc_helper.interpolated_nonequilibrium_bounceback(
                 index,
                 _missing_mask,

From 31009b3b6f564a393ea07d5e68d10050c4fc7819 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 5 Sep 2025 11:36:59 -0400
Subject: [PATCH 174/208] addressed PR review comments

---
 xlb/grid/neon_grid.py                                   | 2 +-
 xlb/operator/boundary_condition/bc_hybrid.py            | 9 ++++-----
 xlb/operator/boundary_condition/helper_functions_bc.py  | 2 +-
 xlb/operator/boundary_masker/helper_functions_masker.py | 2 +-
 xlb/operator/boundary_masker/multires_aabb.py           | 7 ++++---
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index 36ae8ed2..ec54a5ae 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -114,7 +114,7 @@ def cloning(gridIdx: typing.Any):
                     gy = wp.neon_get_y(cIdx)
                     gz = wp.neon_get_z(cIdx)
 
-                    # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+                    # XLB is flattening the z dimension in 3D, while neon uses the y dimension
                     if _d == 2:
                         gy, gz = gz, gy
 
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index fd717fe3..4c79a51a 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -17,7 +17,6 @@
     HelperFunctionsBC,
 )
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
-from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
 
 
 class HybridBC(BoundaryCondition):
@@ -173,10 +172,10 @@ def distance_decoder_function(f_1: Any, index: Any, direction: Any):
     def _construct_profile_functional(self):
         """
         Get the profile functional for this BC.
-        Note:
-        We can impose a profile on a boundary which requires mesh-distance only if that boundary lives on the finest level.
-        This is because I don't know how to extract "level" from the
-        "neon_field_hdl" to do:
+        TODO@Hesam:
+        Right now, we can impose a profile on a boundary which requires mesh-distance only if that boundary lives on the finest level.
+        In order to extract "level" from the "neon_field_hdl" we can use the function wp.neon_level(neon_field_hdl). This will allow us
+        to do the following and get rid of the above limitation.
                cIdx = wp.neon_global_idx(field_neon_hdl, index)
                gx = wp.neon_get_x(cIdx) // 2 ** level
                gy = wp.neon_get_y(cIdx) // 2 ** level
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 1b5f6625..8e7ae7b9 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -323,7 +323,7 @@ def neon_index_to_warp(neon_field_hdl: Any, index: Any):
             gy = wp.neon_get_y(cIdx)
             gz = wp.neon_get_z(cIdx)
 
-            # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+            # XLB is flattening the z dimension in 3D, while neon uses the y dimension
             if _d == 2:
                 gy, gz = gz, gy
 
diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index 470f2362..cb2e3f8c 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -29,7 +29,7 @@ def neon_index_to_warp(neon_field_hdl: Any, index: Any):
             gy = wp.neon_get_y(cIdx)
             gz = wp.neon_get_z(cIdx)
 
-            # TODO@Max - XLB is flattening the z dimension in 3D, while neon uses the y dimension
+            # XLB is flattening the z dimension in 3D, while neon uses the y dimension
             if _d == 2:
                 gy, gz = gz, gy
 
diff --git a/xlb/operator/boundary_masker/multires_aabb.py b/xlb/operator/boundary_masker/multires_aabb.py
index eb8237f6..69e9e7bc 100644
--- a/xlb/operator/boundary_masker/multires_aabb.py
+++ b/xlb/operator/boundary_masker/multires_aabb.py
@@ -15,12 +15,13 @@ class MultiresMeshMaskerAABB(MeshMaskerAABB):
     This implementation uses warp.mesh_query_aabb for efficient mesh-voxel intersection testing,
     providing approximate 1-voxel thick surface detection around the mesh geometry.
     Suitable for scenarios where fast, approximate boundary detection is sufficient.
-    TODO:
-    We cannot properly mask a mesh file if it lives on any level other than the finest. This issue can be easily solved by adding
+    TODO@Hesam:
+    Right now, we cannot properly mask a mesh file if it lives on any level other than the finest. This issue can be easily solved by adding
            gx = wp.neon_get_x(cIdx) // 2 ** level
            gy = wp.neon_get_y(cIdx) // 2 ** level
            gz = wp.neon_get_z(cIdx) // 2 ** level
-    to the "neon_index_to_warp" and subsequently add "level" to the arguments of "index_to_position_neon", "get_pull_index_neon" and "is_in_bc_indices_neon"
+    to the "neon_index_to_warp" and subsequently add "level" to the arguments of "index_to_position_neon", "get_pull_index_neon" and
+    "is_in_bc_indices_neon". In order to extract "level" from the "neon_field_hdl" we can use the function wp.neon_level(neon_field_hdl).
     """
 
     def __init__(

From 0e61c587c716ce02fe09a03ec6a05f1a0be74409 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sat, 6 Sep 2025 11:55:19 -0400
Subject: [PATCH 175/208] Fixed the missing mask in MRES that was incorrect on
 +x, +y and +z bounds of the domain

---
 examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py  | 2 +-
 examples/cfd/grid_refinement/flow_past_sphere_3d.py         | 2 +-
 xlb/operator/boundary_masker/helper_functions_masker.py     | 6 +++++-
 xlb/operator/boundary_masker/indices_boundary_masker.py     | 6 ++++--
 .../boundary_masker/multires_indices_boundary_masker.py     | 2 ++
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 35770231..424c130e 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -211,7 +211,7 @@ def bc_profile_warp(index: wp.vec3i):
 # bc_left = HybridBC(bc_method="bounceback_regularized", profile=bc_profile(), indices=inlet)
 # Alternatively, use a prescribed velocity profile
 # bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
-bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
+bc_walls = HalfwayBounceBackBC(indices=walls)
 # bc_ground = FullwayBounceBackBC(indices=grid.boundary_indices_across_levels(level_data, box_side="front"))
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index 7e87c332..399b2907 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -136,7 +136,7 @@ def bc_profile_warp(index: wp.vec3i):
 bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
 # Alternatively, use a prescribed velocity profile
 # bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
-bc_walls = FullwayBounceBackBC(indices=walls)  # TODO: issues with halfway bounce back only here!
+bc_walls = FullwayBounceBackBC(indices=walls)
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
 bc_sphere = HalfwayBounceBackBC(indices=sphere)
diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index cb2e3f8c..35e31194 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -66,11 +66,14 @@ def get_pull_index_warp(
             field: Any,
             lattice_dir: wp.int32,
             index: wp.vec3i,
+            level: Any,
         ):
             pull_index = wp.vec3i()
             offset = wp.vec3i()
             for d in range(self.velocity_set.d):
                 offset[d] = -_c[d, lattice_dir]
+                for _ in range(level):
+                    offset[d] *= 2
                 pull_index[d] = index[d] + offset[d]
 
             return pull_index, offset
@@ -80,10 +83,11 @@ def get_pull_index_neon(
             field: Any,
             lattice_dir: wp.int32,
             index: Any,
+            level: Any,
         ):
             # Convert the index to warp
             index_wp = neon_index_to_warp(field, index)
-            pull_index_wp, _ = get_pull_index_warp(field, lattice_dir, index_wp)
+            pull_index_wp, _ = get_pull_index_warp(field, lattice_dir, index_wp, level)
             offset = wp.neon_ngh_idx(wp.int8(-_c[0, lattice_dir]), wp.int8(-_c[1, lattice_dir]), wp.int8(-_c[2, lattice_dir]))
             return pull_index_wp, offset
 
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 55a1bb3d..8a8d684b 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -145,6 +145,7 @@ def functional_domain_bounds(
             bc_mask: Any,
             missing_mask: Any,
             grid_shape: Any,
+            level: Any = 0,
         ):
             for ii in range(bc_indices.shape[1]):
                 # If the current index does not match the boundary condition index, we skip it
@@ -164,7 +165,7 @@ def functional_domain_bounds(
                 # Stream indices
                 for l in range(_q):
                     # Get the pull index which is the index of the neighboring node where information is pulled from
-                    pull_index, _ = self.helper_masker.get_pull_index(bc_mask, l, index)
+                    pull_index, _ = self.helper_masker.get_pull_index(bc_mask, l, index, level)
 
                     # Check if pull index is out of bound
                     # These directions will have missing information after streaming
@@ -193,6 +194,7 @@ def functional_interior_missing_mask(
             bc_mask: Any,
             missing_mask: Any,
             grid_shape: Any,
+            level: Any = 0,
         ):
             for ii in range(bc_indices.shape[1]):
                 # If the current index does not match the boundary condition index, we skip it
@@ -200,7 +202,7 @@ def functional_interior_missing_mask(
                     continue
                 for l in range(_q):
                     # Get the index of the streaming direction
-                    pull_index, offset = self.helper_masker.get_pull_index(bc_mask, l, index)
+                    pull_index, offset = self.helper_masker.get_pull_index(bc_mask, l, index, level)
 
                     # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
                     bc_mask_ngh = self.read_field_neighbor(bc_mask, index, offset, 0)
diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
index c9e1e1a0..1c3279b5 100644
--- a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -61,6 +61,7 @@ def domain_bounds_kernel(index: Any):
                         bc_mask_pn,
                         missing_mask_pn,
                         grid_shape,
+                        level,
                     )
 
                 loader.declare_kernel(domain_bounds_kernel)
@@ -114,6 +115,7 @@ def interior_missing_mask_kernel(index: Any):
                         bc_mask_pn,
                         missing_mask_pn,
                         grid_shape,
+                        level,
                     )
 
                 loader.declare_kernel(interior_missing_mask_kernel)

From 11647c29a439c0d13850ac2c667b0071accaf81e Mon Sep 17 00:00:00 2001
From: nmorrisad <nigel.morris@autodesk.com>
Date: Fri, 12 Sep 2025 09:47:37 -0400
Subject: [PATCH 176/208] Multi-res AABB close (#17)

* Multi-res AABB close NEON implementation: performs a morphological close operation on the multi-res mesh and assigns boundary conditions appropriately

* Support for multiple close voxels so larger gaps can be filled
---
 examples/cfd/windtunnel_3d.py                 |   5 +-
 xlb/grid/neon_grid.py                         |   4 +-
 xlb/operator/boundary_masker/__init__.py      |   3 +-
 .../{aabb_fill.py => aabb_close.py}           | 144 ++++++++--
 .../mesh_voxelization_method.py               |   2 +-
 .../boundary_masker/multires_aabb_close.py    | 264 ++++++++++++++++++
 xlb/operator/force/momentum_transfer.py       |   3 +-
 xlb/operator/stepper/nse_multires_stepper.py  |   8 +-
 xlb/operator/stepper/nse_stepper.py           |   6 +-
 9 files changed, 407 insertions(+), 32 deletions(-)
 rename xlb/operator/boundary_masker/{aabb_fill.py => aabb_close.py} (56%)
 create mode 100644 xlb/operator/boundary_masker/multires_aabb_close.py

diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index c8d69aa4..4442cad5 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -92,7 +92,7 @@
 # values would be fine but leave a gap between surfaces that are supposed to touch.
 if voxelization_method in (MeshVoxelizationMethod.RAY, MeshVoxelizationMethod.WINDING):
     shift_z = 2
-elif voxelization_method in (MeshVoxelizationMethod.AABB, MeshVoxelizationMethod.AABB_FILL):
+elif voxelization_method in (MeshVoxelizationMethod.AABB, MeshVoxelizationMethod.AABB_CLOSE):
     shift_z = 3
 shift = np.array([grid_shape[0] / 4, (grid_shape[1] - mesh_extents[1] / dx) / 2, shift_z])
 car_vertices = mesh_vertices + shift
@@ -103,7 +103,8 @@
 bc_walls = FullwayBounceBackBC(indices=walls)
 bc_do_nothing = ExtrapolationOutflowBC(indices=outlet)
 bc_car = HalfwayBounceBackBC(mesh_vertices=car_vertices, voxelization_method=voxelization_method)
-# bc_car = HybridBC(bc_method="nonequilibrium_regularized",  mesh_vertices=car_vertices, voxelization_method=voxelization_method, use_mesh_distance=True)
+# bc_car = HybridBC(bc_method="nonequilibrium_regularized",  mesh_vertices=car_vertices,
+#   voxelization_method=voxelization_method, use_mesh_distance=True)
 boundary_conditions = [bc_walls, bc_left, bc_do_nothing, bc_car]
 
 
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index ec54a5ae..3251f43b 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -30,10 +30,10 @@ def __init__(
 
         # check that the device list is a list of integers
         if not isinstance(backend_config["device_list"], list):
-            raise ValueError(f"backend_config['device_list'] must be a list of integers")
+            raise ValueError("backend_config['device_list'] must be a list of integers")
         for device in backend_config["device_list"]:
             if not isinstance(device, int):
-                raise ValueError(f"backend_config['device_list'] must be a list of integers")
+                raise ValueError("backend_config['device_list'] must be a list of integers")
 
         self.config = backend_config
         self.bk = None
diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index 7cef263f..1cef3042 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -4,7 +4,8 @@
 from xlb.operator.boundary_masker.aabb import MeshMaskerAABB
 from xlb.operator.boundary_masker.ray import MeshMaskerRay
 from xlb.operator.boundary_masker.winding import MeshMaskerWinding
-from xlb.operator.boundary_masker.aabb_fill import MeshMaskerAABBFill
+from xlb.operator.boundary_masker.aabb_close import MeshMaskerAABBClose
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
 from xlb.operator.boundary_masker.multires_aabb import MultiresMeshMaskerAABB
+from xlb.operator.boundary_masker.multires_aabb_close import MultiresMeshMaskerAABBClose
 from xlb.operator.boundary_masker.multires_indices_boundary_masker import MultiresIndicesBoundaryMasker
diff --git a/xlb/operator/boundary_masker/aabb_fill.py b/xlb/operator/boundary_masker/aabb_close.py
similarity index 56%
rename from xlb/operator/boundary_masker/aabb_fill.py
rename to xlb/operator/boundary_masker/aabb_close.py
index 6386546d..a8034ca2 100644
--- a/xlb/operator/boundary_masker/aabb_fill.py
+++ b/xlb/operator/boundary_masker/aabb_close.py
@@ -11,7 +11,7 @@
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
 
 
-class MeshMaskerAABBFill(MeshBoundaryMasker):
+class MeshMaskerAABBClose(MeshBoundaryMasker):
     """
     Operator for creating a boundary missing_mask from an STL file
     """
@@ -21,10 +21,11 @@ def __init__(
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
-        fill_in_voxels: int = 3,
+        close_voxels: int = 3,
     ):
+        self.close_voxels = close_voxels
         # Call super
-        self.tile_half = fill_in_voxels
+        self.tile_half = close_voxels
         self.tile_size = self.tile_half * 2 + 1
         super().__init__(velocity_set, precision_policy, compute_backend)
 
@@ -35,33 +36,73 @@ def _construct_warp(self):
         _opp_indices = self.velocity_set.opp_indices
         TILE_SIZE = wp.constant(self.tile_size)
         TILE_HALF = wp.constant(self.tile_half)
+        lattice_central_index = self.velocity_set.center_index
 
-        # Erode the solid mask in f_field, removing a layer of outer solid voxels, storing output in f_field_out
+        # Erode the solid mask in mask_field, removing a layer of outer solid voxels, storing output in mask_field_out
         @wp.kernel
-        def erode_tile(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any)):
+        def erode_tile(mask_field: wp.array3d(dtype=Any), mask_field_out: wp.array3d(dtype=Any)):
             i, j, k = wp.tid()
             index = wp.vec3i(i, j, k)
-            if not self.helper_masker.is_in_bounds(index, wp.vec3i(f_field.shape[0], f_field.shape[1], f_field.shape[2]), TILE_HALF):
-                f_field_out[i, j, k] = f_field[i, j, k]
+            if not self.helper_masker.is_in_bounds(index, wp.vec3i(mask_field.shape[0], mask_field.shape[1], mask_field.shape[2]), TILE_HALF):
+                mask_field_out[i, j, k] = mask_field[i, j, k]
                 return
-            t = wp.tile_load(f_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
+            t = wp.tile_load(mask_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
             min_val = wp.tile_min(t)
-            f_field_out[i, j, k] = min_val[0]
+            mask_field_out[i, j, k] = min_val[0]
 
-        # Dilate the solid mask in f_field, adding a layer of outer solid voxels, storing output in f_field_out
+        # Dilate the solid mask in mask_field, adding a layer of outer solid voxels, storing output in mask_field_out
         @wp.kernel
-        def dilate_tile(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any)):
+        def dilate_tile(mask_field: wp.array3d(dtype=Any), mask_field_out: wp.array3d(dtype=Any)):
             i, j, k = wp.tid()
             index = wp.vec3i(i, j, k)
-            if not self.helper_masker.is_in_bounds(index, wp.vec3i(f_field.shape[0], f_field.shape[1], f_field.shape[2]), TILE_HALF):
-                f_field_out[i, j, k] = f_field[i, j, k]
+            if not self.helper_masker.is_in_bounds(index, wp.vec3i(mask_field.shape[0], mask_field.shape[1], mask_field.shape[2]), TILE_HALF):
+                mask_field_out[i, j, k] = mask_field[i, j, k]
                 return
-            t = wp.tile_load(f_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
+            t = wp.tile_load(mask_field, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), offset=(i - TILE_HALF, j - TILE_HALF, k - TILE_HALF))
             max_val = wp.tile_max(t)
-            f_field_out[i, j, k] = max_val[0]
+            mask_field_out[i, j, k] = max_val[0]
+
+        # Erode the solid mask in mask_field, removing a layer of outer solid voxels, storing output in mask_field_out
+        @wp.func
+        def functional_erode(index: Any, mask_field: Any, mask_field_out: Any):
+            min_val = wp.uint8(255)
+            for l in range(_q):
+                if l == lattice_central_index:
+                    continue
+                is_valid = wp.bool(False)
+                ngh = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                ngh_val = wp.neon_read_ngh(mask_field, index, ngh, 0, wp.uint8(0), is_valid)
+                if is_valid:
+                    # Take the min value of all neighbors in bounds
+                    min_val = wp.min(min_val, ngh_val)
+            self.write_field(mask_field_out, index, 0, min_val)
+
+        # Dilate the solid mask in mask_field, adding a layer of outer solid voxels, storing output in mask_field_out
+        @wp.func
+        def functional_dilate(index: Any, mask_field: Any, mask_field_out: Any):
+            max_val = wp.uint8(0)
+            for l in range(_q):
+                if l == lattice_central_index:
+                    continue
+                is_valid = wp.bool(False)
+                ngh = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                ngh_val = wp.neon_read_ngh(mask_field, index, ngh, 0, wp.uint8(0), is_valid)
+                if is_valid:
+                    max_val = wp.max(max_val, ngh_val)
+            self.write_field(mask_field_out, index, 0, max_val)
 
         # Construct the warp kernel
         # Find solid voxels that intersect the mesh
+        @wp.func
+        def functional_solid(index: Any, mesh_id: Any, solid_mask: Any, offset: Any):
+            # position of the point
+            cell_center_pos = self.helper_masker.index_to_position(solid_mask, index) + offset
+            half = wp.vec3(0.5, 0.5, 0.5)
+
+            if self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - half):
+                # Make solid voxel
+                self.write_field(solid_mask, index, 0, wp.uint8(255))
+
         @wp.kernel
         def kernel_solid(
             mesh_id: wp.uint64,
@@ -74,13 +115,65 @@ def kernel_solid(
             # Get local indices
             index = wp.vec3i(i, j, k)
 
+            functional_solid(index, mesh_id, solid_mask, offset)
+
+            return
+
+        @wp.func
+        def functional_aabb(
+            index: Any,
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.uint8),
+            solid_mask: wp.array3d(dtype=wp.uint8),
+            needs_mesh_distance: bool,
+        ):
             # position of the point
-            cell_center_pos = self.helper_masker.index_to_position(solid_mask, index) + offset
-            half = wp.vec3(0.5, 0.5, 0.5)
+            cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
+            HALF_VOXEL = wp.vec3(0.5, 0.5, 0.5)
 
-            if self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - half):
+            if self.read_field(solid_mask, index, 0) == wp.uint8(255) or self.read_field(bc_mask, index, 0) == wp.uint8(255):
                 # Make solid voxel
-                solid_mask[index[0], index[1], index[2]] = wp.int32(255)
+                self.write_field(bc_mask, index, 0, wp.uint8(255))
+            else:
+                # Find the boundary voxels and their missing directions
+                for direction_idx in range(_q):
+                    if direction_idx == lattice_central_index:
+                        # Skip the central index as it is not relevant for boundary masking
+                        continue
+
+                    # Get the lattice direction vector
+                    direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
+
+                    # Check to see if this neighbor is solid
+                    if self.helper_masker.is_in_bounds(index, wp.vec3i(solid_mask.shape[0], solid_mask.shape[1], solid_mask.shape[2]), 1):
+                        if self.read_field(solid_mask, index + direction_idx, 0) == wp.uint8(255):
+                            # We know we have a solid neighbor
+                            # Set the boundary id and missing_mask
+                            self.write_field(bc_mask, index, 0, wp.uint8(id_number))
+                            self.write_field(missing_mask, index, _opp_indices[direction_idx], wp.uint8(True))
+
+                            # If we don't need the mesh distance, we can return early
+                            if not needs_mesh_distance:
+                                continue
+
+                            # Find the fractional distance to the mesh in each direction
+                            # We increase max_length to find intersections in neighboring cells
+                            max_length = wp.length(direction_vec)
+                            query = wp.mesh_query_ray(mesh_id, cell_center_pos, direction_vec / max_length, 1.5 * max_length)
+                            if query.result:
+                                # get position of the mesh triangle that intersects with the ray
+                                pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
+                                # We reduce the distance to give some wall thickness
+                                dist = wp.length(pos_mesh - cell_center_pos) - 0.5 * max_length
+                                weight = dist / max_length
+                                self.write_field(distances, index, direction_idx, self.store_dtype(weight))
+                            else:
+                                # Expected an intersection in this direction but none was found.
+                                # Assume the solid extends one lattice unit beyond the BC voxel leading to a distance fraction of 1.
+                                self.write_field(distances, index, direction_idx, self.store_dtype(1.0))
 
         # Assign the bc_mask and distances based on the solid_mask we already computed
         @wp.kernel
@@ -107,7 +200,10 @@ def kernel(
                 bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
             else:
                 # Find the boundary voxels and their missing directions
-                for direction_idx in range(1, _q):
+                for direction_idx in range(_q):
+                    if direction_idx == lattice_central_index:
+                        # Skip the central index as it is not relevant for boundary masking
+                        continue
                     direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
 
                     # Check to see if this neighbor is solid - this is super inefficient TODO: make it way better
@@ -138,13 +234,19 @@ def kernel(
                             # and one lattice direction away from the BC voxel
                             distances[direction_idx, index[0], index[1], index[2]] = self.store_dtype(1.0)
 
+        functional_dict = {
+            "functional_erode": functional_erode,
+            "functional_dilate": functional_dilate,
+            "functional_solid": functional_solid,
+            "functional_aabb": functional_aabb,
+        }
         kernel_dict = {
             "kernel": kernel,
             "kernel_solid": kernel_solid,
             "erode_tile": erode_tile,
             "dilate_tile": dilate_tile,
         }
-        return None, kernel_dict
+        return functional_dict, kernel_dict
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(
diff --git a/xlb/operator/boundary_masker/mesh_voxelization_method.py b/xlb/operator/boundary_masker/mesh_voxelization_method.py
index ce5eb93e..d10dc8a8 100644
--- a/xlb/operator/boundary_masker/mesh_voxelization_method.py
+++ b/xlb/operator/boundary_masker/mesh_voxelization_method.py
@@ -6,5 +6,5 @@
 class MeshVoxelizationMethod(Enum):
     AABB = auto()
     RAY = auto()
-    AABB_FILL = auto()
+    AABB_CLOSE = auto()
     WINDING = auto()
diff --git a/xlb/operator/boundary_masker/multires_aabb_close.py b/xlb/operator/boundary_masker/multires_aabb_close.py
new file mode 100644
index 00000000..c2b4a011
--- /dev/null
+++ b/xlb/operator/boundary_masker/multires_aabb_close.py
@@ -0,0 +1,264 @@
+import warp as wp
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.boundary_masker import MeshMaskerAABBClose
+from xlb.operator.operator import Operator
+import neon
+
+
+class MultiresMeshMaskerAABBClose(MeshMaskerAABBClose):
+    """
+    Operator for creating boundary missing_mask from mesh using Axis-Aligned Bounding Box (AABB) voxelization
+    in multiresolution simulations (NEON backend). It takes in a number of close_voxels to perform morphological
+    operations (dilate followed by erode) to ensure small channels are filled with solid voxels.
+
+    This version provides NEON-specific functionals working on multires partitions (mPartition) and bIndex.
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+        close_voxels: int = 4,
+    ):
+        super().__init__(velocity_set, precision_policy, compute_backend, close_voxels)
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
+
+        # Build and store NEON dicts
+        self.neon_functional_dict, self.neon_container_dict = self._construct_neon()
+
+    def _construct_neon(self):
+        # Use the warp functionals from the base (for reference), but implement NEON variants here
+        functional_dict_warp, _ = self._construct_warp()
+        functional_erode_warp = functional_dict_warp.get("functional_erode")
+        functional_dilate_warp = functional_dict_warp.get("functional_dilate")
+        functional_solid = functional_dict_warp.get("functional_solid")
+        # We will not directly reuse functional_solid / functional_aabb from warp; we write NEON-specific ones.
+
+        # We also need lattice info for neighbor iteration
+        _c = self.velocity_set.c
+        _q = self.velocity_set.q
+        _opp_indices = self.velocity_set.opp_indices
+
+        # Set local constants
+        lattice_central_index = self.velocity_set.center_index
+
+        # Main AABB close: sets bc_mask, missing_mask, distances based on solid_mask
+        # bc_mask: wp.uint8, missing_mask: wp.uint8, distances: dtype from precision policy (float)
+        @wp.func
+        def mres_functional_aabb(
+            index: Any,
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances_pn: Any,  # mPartition(dtype=distance type), cardinality=_q
+            bc_mask_pn: Any,  # mPartition_uint8, cardinality=1
+            missing_mask_pn: Any,  # mPartition_uint8, cardinality=_q
+            solid_mask_pn: Any,  # mPartition_uint8, cardinality=1
+            needs_mesh_distance: bool,
+        ):
+            # Cell center from bc_mask partition
+            cell_center = self.helper_masker.index_to_position(bc_mask_pn, index)
+
+            # If already solid or bc, mark solid
+            solid_val = wp.neon_read(solid_mask_pn, index, 0)
+            bc_val = wp.neon_read(bc_mask_pn, index, 0)
+            if solid_val == wp.uint8(255) or bc_val == wp.uint8(255):
+                wp.neon_write(bc_mask_pn, index, 0, wp.uint8(255))
+                return
+
+            # loop lattice directions
+            for direction_idx in range(_q):
+                # skip central if provided by velocity set
+                if direction_idx == lattice_central_index:
+                    continue
+
+                # If neighbor index is valid at this resolution level
+                ngh = wp.neon_ngh_idx(wp.int8(_c[0, direction_idx]), wp.int8(_c[1, direction_idx]), wp.int8(_c[2, direction_idx]))
+                is_valid = wp.bool(False)
+                nval = wp.neon_read_ngh(solid_mask_pn, index, ngh, 0, wp.uint8(0), is_valid)
+                if is_valid:
+                    if nval == wp.uint8(255):
+                        # Found solid neighbor -> boundary cell
+                        self.write_field(bc_mask_pn, index, 0, wp.uint8(id_number))
+                        self.write_field(missing_mask_pn, index, _opp_indices[direction_idx], wp.uint8(True))
+
+                        if not needs_mesh_distance:
+                            # No distance needed; continue to next direction
+                            continue
+
+                        # Compute mesh distance along lattice direction
+                        dir_vec = wp.vec3f(
+                            wp.float32(_c[0, direction_idx]),
+                            wp.float32(_c[1, direction_idx]),
+                            wp.float32(_c[2, direction_idx]),
+                        )
+                        max_length = wp.length(dir_vec)
+                        # Avoid division by zero for any pathological dir (shouldn't happen)
+                        norm_dir = dir_vec / (max_length if max_length > 0.0 else 1.0)
+                        query = wp.mesh_query_ray(mesh_id, cell_center, norm_dir, 1.5 * max_length)
+                        if query.result:
+                            pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
+                            dist = wp.length(pos_mesh - cell_center) - 0.5 * max_length
+                            weight = dist / (max_length if max_length > 0.0 else 1.0)
+                            # distances has cardinality _q; store into this channel
+                            self.write_field(distances_pn, index, direction_idx, wp.float32(weight))
+                        else:
+                            self.write_field(distances_pn, index, direction_idx, wp.float32(1.0))
+
+        # Containers
+
+        # Erode: f_field -> f_field_out
+        @neon.Container.factory(name="Erode")
+        def container_erode(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any), level: int):
+            def erode_launcher(loader: neon.Loader):
+                loader.set_mres_grid(f_field.get_grid(), level)
+                f_field_pn = loader.get_mres_read_handle(f_field)
+                f_field_out_pn = loader.get_mres_write_handle(f_field_out)
+
+                @wp.func
+                def erode_kernel(index: Any):
+                    functional_erode_warp(index, f_field_pn, f_field_out_pn)
+
+                loader.declare_kernel(erode_kernel)
+
+            return erode_launcher
+
+        # Dilate: f_field -> f_field_out
+        @neon.Container.factory(name="Dilate")
+        def container_dilate(f_field: wp.array3d(dtype=Any), f_field_out: wp.array3d(dtype=Any), level: int):
+            def dilate_launcher(loader: neon.Loader):
+                loader.set_mres_grid(f_field.get_grid(), level)
+                f_field_pn = loader.get_mres_read_handle(f_field)
+                f_field_out_pn = loader.get_mres_write_handle(f_field_out)
+
+                @wp.func
+                def dilate_kernel(index: Any):
+                    functional_dilate_warp(index, f_field_pn, f_field_out_pn)
+
+                loader.declare_kernel(dilate_kernel)
+
+            return dilate_launcher
+
+        # Solid mask: voxelize mesh into solid_mask
+        @neon.Container.factory(name="Solid")
+        def container_solid(mesh_id: wp.uint64, solid_mask: wp.array3d(dtype=wp.uint8), level: int):
+            def solid_launcher(loader: neon.Loader):
+                loader.set_mres_grid(solid_mask.get_grid(), level)
+                solid_mask_pn = loader.get_mres_write_handle(solid_mask)
+
+                @wp.func
+                def solid_kernel(index: Any):
+                    # apply the functional
+                    functional_solid(index, mesh_id, solid_mask_pn, wp.vec3f(0.0, 0.0, 0.0))
+
+                loader.declare_kernel(solid_kernel)
+
+            return solid_launcher
+
+        # Main AABB container
+        @neon.Container.factory(name="MeshMaskerAABBClose")
+        def container(
+            mesh_id: Any,
+            id_number: Any,
+            distances: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            solid_mask: Any,
+            needs_mesh_distance: Any,
+            level: Any,
+        ):
+            def aabb_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask.get_grid(), level)
+                distances_pn = loader.get_mres_write_handle(distances)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_write_handle(missing_mask)
+                solid_mask_pn = loader.get_mres_write_handle(solid_mask)
+
+                @wp.func
+                def aabb_kernel(index: Any):
+                    mres_functional_aabb(
+                        index,
+                        mesh_id,
+                        id_number,
+                        distances_pn,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        solid_mask_pn,
+                        needs_mesh_distance,
+                    )
+
+                loader.declare_kernel(aabb_kernel)
+
+            return aabb_launcher
+
+        container_dict = {
+            "container_erode": container_erode,
+            "container_dilate": container_dilate,
+            "container_solid": container_solid,
+            "container_aabb": container,
+        }
+
+        # Expose NEON functionals too (in case callers want to reuse)
+        functional_dict = {
+            "mres_functional_aabb": mres_functional_aabb,
+        }
+
+        return functional_dict, container_dict
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+        stream=0,
+    ):
+        # Prepare inputs
+        mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
+
+        grid = bc_mask.get_grid()
+        # Create fields using new_field
+        solid_mask = grid.new_field(cardinality=1, dtype=wp.uint8, memory_type=neon.MemoryType.device())
+        solid_mask_out = grid.new_field(
+            cardinality=1,
+            dtype=wp.uint8,
+            memory_type=neon.MemoryType.device(),
+            # memory_type=neon.MemoryType.host_device()
+        )
+
+        for level in range(grid.num_levels):
+            # Initialize to 0
+            solid_mask.fill_run(level=level, value=wp.uint8(0), stream_idx=stream)
+            solid_mask_out.fill_run(level=level, value=wp.uint8(0), stream_idx=stream)
+
+            # Launch the neon containers
+            container_solid = self.neon_container_dict["container_solid"](mesh_id, solid_mask, level)
+            container_solid.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+            for _ in range(self.close_voxels):
+                container_dilate = self.neon_container_dict["container_dilate"](solid_mask, solid_mask_out, level)
+                container_dilate.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+                solid_mask, solid_mask_out = solid_mask_out, solid_mask
+
+            if self.close_voxels % 2 > 0:
+                solid_mask, solid_mask_out = solid_mask_out, solid_mask
+
+            for _ in range(self.close_voxels):
+                container_erode = self.neon_container_dict["container_erode"](solid_mask_out, solid_mask, level)
+                container_erode.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+                solid_mask, solid_mask_out = solid_mask_out, solid_mask
+
+            if self.close_voxels % 2 > 0:
+                solid_mask, solid_mask_out = solid_mask_out, solid_mask
+
+            container_aabb = self.neon_container_dict["container_aabb"](
+                mesh_id, bc_id, distances, bc_mask, missing_mask, solid_mask, wp.static(bc.needs_mesh_distance), level
+            )
+            container_aabb.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+
+        return distances, bc_mask, missing_mask
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 1ea80555..c5de920c 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -17,7 +17,8 @@
 class LBMOperationSequence(Enum):
     """
     Note that for dense and single resolution simulations in XLB, the order of operations in the stepper is "stream-then-collide".
-    For MultiRes stepper however the order of operations is always "collide-then-stream" except at the finest level when the FUSION_AT_FINEST optimization is used.
+    For MultiRes stepper however the order of operations is always "collide-then-stream" except at the finest level when the FUSION_AT_FINEST
+    optimization is used.
     In that case the order of operations is "stream-then-collide" ONLY at the finest level.
     """
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 6e5e3889..fd391c22 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -17,7 +17,7 @@
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.collision import ForcedCollision
 from xlb.helper import check_bc_overlaps
-from xlb.operator.boundary_masker import MeshVoxelizationMethod, MultiresMeshMaskerAABB, MultiresIndicesBoundaryMasker
+from xlb.operator.boundary_masker import MeshVoxelizationMethod, MultiresMeshMaskerAABB, MultiresMeshMaskerAABBClose, MultiresIndicesBoundaryMasker
 from xlb.operator.boundary_condition.helper_functions_bc import MultiresEncodeAuxiliaryData
 
 
@@ -236,6 +236,12 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
+                elif bc.voxelization_method is MeshVoxelizationMethod.AABB_CLOSE:
+                    mesh_masker = MultiresMeshMaskerAABBClose(
+                        velocity_set=DefaultConfig.velocity_set,
+                        precision_policy=DefaultConfig.default_precision_policy,
+                        compute_backend=DefaultConfig.default_backend,
+                    )
                 else:
                     raise ValueError(f"Unsupported voxelization method for multi-res: {bc.voxelization_method}")
                 # Apply the mesh masker to the boundary condition
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index a2f84314..96aa75f8 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -25,7 +25,7 @@
     MeshMaskerAABB,
     MeshMaskerRay,
     MeshMaskerWinding,
-    MeshMaskerAABBFill,
+    MeshMaskerAABBClose,
 )
 from xlb.helper import check_bc_overlaps
 from xlb.helper.nse_fields import create_nse_fields
@@ -153,8 +153,8 @@ def _process_boundary_conditions(self, boundary_conditions, f_1, bc_mask, missin
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
-                elif bc.voxelization_method is MeshVoxelizationMethod.AABB_FILL:
-                    mesh_masker = MeshMaskerAABBFill(
+                elif bc.voxelization_method is MeshVoxelizationMethod.AABB_CLOSE:
+                    mesh_masker = MeshMaskerAABBClose(
                         velocity_set=DefaultConfig.velocity_set,
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,

From d81fd2159d542844d2a0566f1591fd790d8a15e6 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hsalehipour@gmail.com>
Date: Mon, 15 Sep 2025 08:33:25 -0400
Subject: [PATCH 177/208] Mesh voxelization method (#22)

mesh_voxelization_method is now callable with optional input arguments. Also, BC values are no longer hardcoded in the multires macroscopic kernel.
---
 .../cuboid_flow_past_sphere_3d.py             |  4 +--
 examples/cfd/rotating_sphere_3d.py            |  2 +-
 examples/cfd/windtunnel_3d.py                 |  6 ++--
 xlb/operator/boundary_masker/aabb_close.py    |  5 +++-
 .../mesh_voxelization_method.py               | 28 ++++++++++++++-----
 .../boundary_masker/multires_aabb_close.py    |  2 +-
 .../macroscopic/multires_macroscopic.py       |  5 ----
 xlb/operator/stepper/nse_multires_stepper.py  |  5 ++--
 xlb/operator/stepper/nse_stepper.py           |  9 +++---
 9 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 424c130e..14ae5ffa 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -215,9 +215,9 @@ def bc_profile_warp(index: wp.vec3i):
 # bc_ground = FullwayBounceBackBC(indices=grid.boundary_indices_across_levels(level_data, box_side="front"))
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
-# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB)
+# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod('AABB'))
 bc_sphere = HybridBC(
-    bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod.AABB, use_mesh_distance=True
+    bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod("AABB"), use_mesh_distance=True
 )
 
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index 03906640..b1e29e33 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -130,7 +130,7 @@ def bc_profile_warp(index: wp.vec3i):
     bc_method="nonequilibrium_regularized",
     mesh_vertices=sphere,
     use_mesh_distance=True,
-    voxelization_method=MeshVoxelizationMethod.RAY,
+    voxelization_method=MeshVoxelizationMethod("RAY"),
     profile=bc_profile(),
 )
 # Not assining BC for walls makes them periodic.
diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index 4442cad5..3786c4db 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -75,7 +75,7 @@
 
 # Load the mesh (replace with your own mesh)
 stl_filename = "../stl-files/DrivAer-Notchback.stl"
-voxelization_method = MeshVoxelizationMethod.RAY
+voxelization_method = MeshVoxelizationMethod("RAY")
 mesh = trimesh.load_mesh(stl_filename, process=False)
 mesh_vertices = mesh.vertices
 
@@ -90,9 +90,9 @@
 # Depending on the voxelization method, shift_z ensures the bottom ground does not intersect with the voxelized mesh
 # Any smaller shift value would lead to large lift computations due to the initial equilibrium distributions. Bigger
 # values would be fine but leave a gap between surfaces that are supposed to touch.
-if voxelization_method in (MeshVoxelizationMethod.RAY, MeshVoxelizationMethod.WINDING):
+if voxelization_method in (MeshVoxelizationMethod("RAY"), MeshVoxelizationMethod("WINDING")):
     shift_z = 2
-elif voxelization_method in (MeshVoxelizationMethod.AABB, MeshVoxelizationMethod.AABB_CLOSE):
+elif voxelization_method in (MeshVoxelizationMethod("AABB"), MeshVoxelizationMethod("AABB_CLOSE", close_voxels=3)):
     shift_z = 3
 shift = np.array([grid_shape[0] / 4, (grid_shape[1] - mesh_extents[1] / dx) / 2, shift_z])
 car_vertices = mesh_vertices + shift
diff --git a/xlb/operator/boundary_masker/aabb_close.py b/xlb/operator/boundary_masker/aabb_close.py
index a8034ca2..ea3ac636 100644
--- a/xlb/operator/boundary_masker/aabb_close.py
+++ b/xlb/operator/boundary_masker/aabb_close.py
@@ -21,8 +21,11 @@ def __init__(
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
-        close_voxels: int = 3,
+        close_voxels: int = None,
     ):
+        assert close_voxels is not None, (
+            "Please provide the number of close voxels using the 'close_voxels' argument! e.g., MeshVoxelizationMethod('AABB_CLOSE', close_voxels=3)"
+        )
         self.close_voxels = close_voxels
         # Call super
         self.tile_half = close_voxels
diff --git a/xlb/operator/boundary_masker/mesh_voxelization_method.py b/xlb/operator/boundary_masker/mesh_voxelization_method.py
index d10dc8a8..3d2b1d6b 100644
--- a/xlb/operator/boundary_masker/mesh_voxelization_method.py
+++ b/xlb/operator/boundary_masker/mesh_voxelization_method.py
@@ -1,10 +1,24 @@
-# Enum used to keep track of the available voxelization methods
+# A class used to keep track of the available voxelization methods
 
-from enum import Enum, auto
+from dataclasses import dataclass
 
 
-class MeshVoxelizationMethod(Enum):
-    AABB = auto()
-    RAY = auto()
-    AABB_CLOSE = auto()
-    WINDING = auto()
+# Registry
+METHODS = {
+    "AABB": 1,
+    "RAY": 2,
+    "AABB_CLOSE": 3,
+    "WINDING": 4,
+}
+
+
+@dataclass
+class VoxelizationMethod:
+    id: int
+    name: str
+    options: dict
+
+
+def MeshVoxelizationMethod(name: str, **options):
+    assert name in METHODS.keys(), f"Unsupported voxelization method: {name}"
+    return VoxelizationMethod(METHODS[name], name, options)
diff --git a/xlb/operator/boundary_masker/multires_aabb_close.py b/xlb/operator/boundary_masker/multires_aabb_close.py
index c2b4a011..919fdb31 100644
--- a/xlb/operator/boundary_masker/multires_aabb_close.py
+++ b/xlb/operator/boundary_masker/multires_aabb_close.py
@@ -22,7 +22,7 @@ def __init__(
         velocity_set: VelocitySet = None,
         precision_policy: PrecisionPolicy = None,
         compute_backend: ComputeBackend = None,
-        close_voxels: int = 4,
+        close_voxels: int = None,
     ):
         super().__init__(velocity_set, precision_policy, compute_backend, close_voxels)
         if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index 6cab84d8..0de4f54a 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -55,11 +55,6 @@ def macroscopic_cl(gIdx: typing.Any):
 
                     _rho, _u = functional(_f)
 
-                    if _boundary_id != wp.uint8(0):
-                        _rho = self.compute_dtype(1.0)
-                        for d in range(_d):
-                            _u[d] = self.compute_dtype(0.0)
-
                     if _boundary_id == wp.uint8(255) or wp.neon_has_child(f, gIdx):
                         _rho = self.compute_dtype(0.0)
                         for d in range(_d):
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index fd391c22..85b66842 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -230,17 +230,18 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
             for bc in bc_with_vertices:
-                if bc.voxelization_method is MeshVoxelizationMethod.AABB:
+                if bc.voxelization_method.id is MeshVoxelizationMethod("AABB").id:
                     mesh_masker = MultiresMeshMaskerAABB(
                         velocity_set=DefaultConfig.velocity_set,
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
-                elif bc.voxelization_method is MeshVoxelizationMethod.AABB_CLOSE:
+                elif bc.voxelization_method.id is MeshVoxelizationMethod("AABB_CLOSE").id:
                     mesh_masker = MultiresMeshMaskerAABBClose(
                         velocity_set=DefaultConfig.velocity_set,
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
+                        close_voxels=bc.voxelization_method.options.get("close_voxels"),
                     )
                 else:
                     raise ValueError(f"Unsupported voxelization method for multi-res: {bc.voxelization_method}")
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 96aa75f8..e4c2e6fc 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -135,29 +135,30 @@ def _process_boundary_conditions(self, boundary_conditions, f_1, bc_mask, missin
         # Process mesh-based boundary conditions for 3D
         if DefaultConfig.velocity_set.d == 3 and bc_with_vertices:
             for bc in bc_with_vertices:
-                if bc.voxelization_method is MeshVoxelizationMethod.AABB:
+                if bc.voxelization_method.id is MeshVoxelizationMethod("AABB").id:
                     mesh_masker = MeshMaskerAABB(
                         velocity_set=DefaultConfig.velocity_set,
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
-                elif bc.voxelization_method is MeshVoxelizationMethod.RAY:
+                elif bc.voxelization_method.id is MeshVoxelizationMethod("RAY").id:
                     mesh_masker = MeshMaskerRay(
                         velocity_set=DefaultConfig.velocity_set,
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
-                elif bc.voxelization_method is MeshVoxelizationMethod.WINDING:
+                elif bc.voxelization_method.id is MeshVoxelizationMethod("WINDING").id:
                     mesh_masker = MeshMaskerWinding(
                         velocity_set=DefaultConfig.velocity_set,
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
-                elif bc.voxelization_method is MeshVoxelizationMethod.AABB_CLOSE:
+                elif bc.voxelization_method.id is MeshVoxelizationMethod("AABB_CLOSE").id:
                     mesh_masker = MeshMaskerAABBClose(
                         velocity_set=DefaultConfig.velocity_set,
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
+                        close_voxels=bc.voxelization_method.options.get("close_voxels"),
                     )
                 else:
                     raise ValueError(f"Unsupported voxelization method: {bc.voxelization_method}")

From e56e62aa9a26e434d218d2ac7b6711f12e8f2fb3 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 15 Sep 2025 15:40:55 -0400
Subject: [PATCH 178/208] Modified the OutletInitializer to be more
 customizable. Also added its JAX implementation.

---
 .../cuboid_flow_past_sphere_3d.py             |  8 +-
 examples/cfd/rotating_sphere_3d.py            |  8 +-
 xlb/helper/__init__.py                        |  4 +-
 xlb/helper/initializers.py                    | 87 +++++++++++++------
 xlb/operator/boundary_condition/bc_hybrid.py  | 11 ++-
 5 files changed, 77 insertions(+), 41 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index 14ae5ffa..bfb16cbc 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -227,11 +227,11 @@ def bc_profile_warp(index: wp.vec3i):
 omega = 1.0 / (3.0 * visc + 0.5)
 
 # Make initializer operator
-from xlb.helper.initializers import MultiresOutletInitializer
+from xlb.helper.initializers import CustomMultiresInitializer
 
-initializer = MultiresOutletInitializer(
-    outlet_bc_id=bc_outlet.id,
-    wind_vector=(u_max, 0.0, 0.0),
+initializer = CustomMultiresInitializer(
+    bc_id=bc_outlet.id,
+    constant_velocity_vector=(u_max, 0.0, 0.0),
     velocity_set=velocity_set,
     precision_policy=precision_policy,
     compute_backend=compute_backend,
diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index b1e29e33..450edbbd 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -145,11 +145,11 @@ def bc_profile_warp(index: wp.vec3i):
 )
 
 # Make initializer operator
-from xlb.helper.initializers import OutletInitializer
+from xlb.helper.initializers import CustomInitializer
 
-initializer = OutletInitializer(
-    outlet_bc_id=bc_do_nothing.id,
-    wind_vector=(wind_speed, 0.0, 0.0),
+initializer = CustomInitializer(
+    bc_id=bc_do_nothing.id,
+    constant_velocity_vector=(wind_speed, 0.0, 0.0),
     velocity_set=velocity_set,
     precision_policy=precision_policy,
     compute_backend=compute_backend,
diff --git a/xlb/helper/__init__.py b/xlb/helper/__init__.py
index 687dc547..3b1a102e 100644
--- a/xlb/helper/__init__.py
+++ b/xlb/helper/__init__.py
@@ -1,6 +1,4 @@
 from xlb.helper.nse_fields import create_nse_fields
-from xlb.helper.initializers import initialize_eq
-from xlb.helper.initializers import initialize_multires_eq
-from xlb.helper.initializers import OutletInitializer
+from xlb.helper.initializers import initialize_eq, initialize_multires_eq, CustomInitializer, CustomMultiresInitializer
 from xlb.helper.check_boundary_overlaps import check_bc_overlaps
 from xlb.helper.simulation_manager import MultiresSimulationManager
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 20c7b779..b6f6ae7c 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -1,5 +1,6 @@
 import warp as wp
 from typing import Any
+from xlb import DefaultConfig
 from xlb.operator import Operator
 from xlb.velocity_set import VelocitySet
 from xlb.compute_backend import ComputeBackend
@@ -31,51 +32,79 @@ def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=
 
 def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho, u):
     equilibrium = MultiresQuadraticEquilibrium()
-    equilibrium(rho, u, f, stream=0)
-    return f
+    return equilibrium(rho, u, f, stream=0)
 
 
-# Defining an initializer for outlet only
-class OutletInitializer(Operator):
+# Defining an initializer operator that initializes the entire domain or the specified BC to a constant velocity and density
+class CustomInitializer(Operator):
     def __init__(
         self,
-        outlet_bc_id: int = None,
-        wind_vector=None,
+        constant_velocity_vector=[0.0, 0.0, 0.0],
+        constant_density: float = 1.0,
+        bc_id: int = -1,
+        initialization_operator=None,
         velocity_set: VelocitySet = None,
         precision_policy=None,
         compute_backend=None,
     ):
-        assert outlet_bc_id is not None, "Outlet BC ID must be provided."
-        self.outlet_bc_id = outlet_bc_id
-        self.wind_vector = wind_vector
-        self.rho = 1.0
-        self.equilibrium = QuadraticEquilibrium(
-            velocity_set=velocity_set,
-            precision_policy=precision_policy,
-            compute_backend=ComputeBackend.WARP,
-        )
+        self.bc_id = bc_id
+        self.constant_velocity_vector = constant_velocity_vector
+        self.constant_density = constant_density
+        if initialization_operator is None:
+            compute_backend = compute_backend or DefaultConfig.default_backend
+            self.initialization_operator = QuadraticEquilibrium(
+                velocity_set=velocity_set or DefaultConfig.velocity_set,
+                precision_policy=precision_policy or DefaultConfig.precision_policy,
+                compute_backend=compute_backend if compute_backend == ComputeBackend.JAX else ComputeBackend.WARP,
+            )
         super().__init__(velocity_set, precision_policy, compute_backend)
 
+    @Operator.register_backend(ComputeBackend.JAX)
+    def jax_implementation(self, bc_mask, f_field):
+        from xlb.grid import grid_factory
+        import jax.numpy as jnp
+
+        grid_shape = f_field.shape[1:]
+        grid = grid_factory(grid_shape)
+        rho_init = grid.create_field(cardinality=1, fill_value=self.constant_density, dtype=self.precision_policy.compute_precision)
+        u_init = grid.create_field(cardinality=self.velocity_set.d, fill_value=0.0, dtype=self.precision_policy.compute_precision)
+        _vel = jnp.array(self.constant_velocity_vector)[(...,) + (None,)*self.velocity_set.d]
+        if self.bc_id == -1:
+            u_init += _vel
+        else:
+            u_init = jnp.where(bc_mask[0] == self.bc_id, u_init + _vel, u_init)
+        return self.initialization_operator(rho_init, u_init)
+
     def _construct_warp(self):
         _q = self.velocity_set.q
         _u_vec = wp.vec(self.velocity_set.d, dtype=self.compute_dtype)
-        _u = _u_vec(self.wind_vector[0], self.wind_vector[1], self.wind_vector[2])
-        _rho = self.compute_dtype(self.rho)
+        _u = _u_vec(self.constant_velocity_vector[0], self.constant_velocity_vector[1], self.constant_velocity_vector[2])
+        _rho = self.compute_dtype(self.constant_density)
         _w = self.velocity_set.w
-        outlet_bc_id = self.outlet_bc_id
+        bc_id = self.bc_id
 
         @wp.func
-        def functional(index: Any, bc_mask: Any, f_field: Any):
+        def functional_local(index: Any, bc_mask: Any, f_field: Any):
             # Check if the index corresponds to the outlet
-            if self.read_field(bc_mask, index, 0) == outlet_bc_id:
-                _feq = self.equilibrium.warp_functional(_rho, _u)
+            if self.read_field(bc_mask, index, 0) == bc_id:
+                _f_init = self.initialization_operator.warp_functional(_rho, _u)
                 for l in range(_q):
-                    self.write_field(f_field, index, l, _feq[l])
+                    self.write_field(f_field, index, l, _f_init[l])
             else:
                 # In the rest of the domain, we assume zero velocity and equilibrium distribution.
                 for l in range(_q):
                     self.write_field(f_field, index, l, _w[l])
 
+        @wp.func
+        def functional_domain(index: Any, bc_mask: Any, f_field: Any):
+            # If bc_id is -1, initialize the entire domain according to the custom initialization operator for the given velocity
+            _f_init = self.initialization_operator.warp_functional(_rho, _u)
+            for l in range(_q):
+                self.write_field(f_field, index, l, _f_init[l])
+
+        # Set the functional based on whether we are initializing a specific BC or the entire domain
+        functional = functional_local if self.bc_id != -1 else functional_domain
+
         # Construct the warp kernel
         @wp.kernel
         def kernel(
@@ -105,7 +134,7 @@ def _construct_neon(self):
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
-        @neon.Container.factory(name="OutletInitializer")
+        @neon.Container.factory(name="CustomInitializer")
         def container(
             bc_mask: Any,
             f_field: Any,
@@ -135,22 +164,24 @@ def neon_implementation(self, bc_mask, f_field, stream=0):
 
 
 # Defining an initializer for outlet only
-class MultiresOutletInitializer(OutletInitializer):
+class CustomMultiresInitializer(CustomInitializer):
     def __init__(
         self,
-        outlet_bc_id: int = None,
-        wind_vector=None,
+        constant_velocity_vector=[0.0, 0.0, 0.0],
+        constant_density: float = 1.0,
+        bc_id: int = -1,
+        initialization_operator=None,
         velocity_set: VelocitySet = None,
         precision_policy=None,
         compute_backend=None,
     ):
-        super().__init__(outlet_bc_id, wind_vector, velocity_set, precision_policy, compute_backend)
+        super().__init__(constant_velocity_vector, constant_density, bc_id, initialization_operator, velocity_set, precision_policy, compute_backend)
 
     def _construct_neon(self):
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
-        @neon.Container.factory(name="MultiresOutletInitializer")
+        @neon.Container.factory(name="CustomMultiresInitializer")
         def container(
             bc_mask: Any,
             f_field: Any,
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 4c79a51a..e6623c0f 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -183,7 +183,7 @@ def _construct_profile_functional(self):
         """
 
         @wp.func
-        def profile_functional(f_1: Any, index: Any, timestep: Any):
+        def profile_functional_neon(f_1: Any, index: Any, timestep: Any):
             # Convert neon index to warp index
             warp_index = self.bc_helper.neon_index_to_warp(f_1, index)
             if wp.static(self.is_time_dependent):
@@ -191,7 +191,14 @@ def profile_functional(f_1: Any, index: Any, timestep: Any):
             else:
                 return self.profile(warp_index)
 
-        return profile_functional
+        @wp.func
+        def profile_functional_warp(f_1: Any, index: Any, timestep: Any):
+            if wp.static(self.is_time_dependent):
+                return self.profile(index, timestep)
+            else:
+                return self.profile(index)
+
+        return profile_functional_warp if self.compute_backend == ComputeBackend.WARP else profile_functional_neon
 
     def _construct_warp(self):
         # Construct the functionals for this BC

From e715fc9c1bff8b9efd157935527da39a22e0a971 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 16 Sep 2025 08:24:27 +0200
Subject: [PATCH 179/208] refactor(mesher) : remove JAX dependency for warp and
 neon backends.

Uses the `.numpy()` method directly for converting field warp data to NumPy arrays.
This avoids the need for JAX when using the warp or neon backend.
---
 xlb/utils/mesher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 907b842f..d71caf93 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -478,7 +478,7 @@ def get_fields_data(self, field_neon_dict):
 
                 # Convert the warp fields to numpy arrays and use level's mask to filter the data
                 mask = self.levels_data[level][0]
-                field_np = np.array(wp.to_jax(self.field_warp_dict[field_name][level]))
+                field_np = self.field_warp_dict[field_name][level].numpy()
                 for card in range(cardinality):
                     field_np_card = field_np[card][mask]
                     fields_data[f"{field_name}_{card}"].append(field_np_card)

From d4447b36b69ef671ec1e1fbf206cfe31e31c69f3 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 15 Sep 2025 19:37:43 -0400
Subject: [PATCH 180/208] Added multi-res RAY mesh masker (and Neon dense)

---
 xlb/helper/initializers.py                    |   2 +-
 xlb/operator/boundary_masker/__init__.py      |   1 +
 xlb/operator/boundary_masker/aabb.py          |   1 +
 .../boundary_masker/multires_aabb_close.py    |   4 +-
 xlb/operator/boundary_masker/multires_ray.py  |  85 ++++++++++++
 xlb/operator/boundary_masker/ray.py           | 121 +++++++++++++++---
 xlb/operator/stepper/nse_multires_stepper.py  |  14 +-
 7 files changed, 205 insertions(+), 23 deletions(-)
 create mode 100644 xlb/operator/boundary_masker/multires_ray.py

diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index b6f6ae7c..a1fd7107 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -68,7 +68,7 @@ def jax_implementation(self, bc_mask, f_field):
         grid = grid_factory(grid_shape)
         rho_init = grid.create_field(cardinality=1, fill_value=self.constant_density, dtype=self.precision_policy.compute_precision)
         u_init = grid.create_field(cardinality=self.velocity_set.d, fill_value=0.0, dtype=self.precision_policy.compute_precision)
-        _vel = jnp.array(self.constant_velocity_vector)[(...,) + (None,)*self.velocity_set.d]
+        _vel = jnp.array(self.constant_velocity_vector)[(...,) + (None,) * self.velocity_set.d]
         if self.bc_id == -1:
             u_init += _vel
         else:
diff --git a/xlb/operator/boundary_masker/__init__.py b/xlb/operator/boundary_masker/__init__.py
index 1cef3042..5a1ceb75 100644
--- a/xlb/operator/boundary_masker/__init__.py
+++ b/xlb/operator/boundary_masker/__init__.py
@@ -9,3 +9,4 @@
 from xlb.operator.boundary_masker.multires_aabb import MultiresMeshMaskerAABB
 from xlb.operator.boundary_masker.multires_aabb_close import MultiresMeshMaskerAABBClose
 from xlb.operator.boundary_masker.multires_indices_boundary_masker import MultiresIndicesBoundaryMasker
+from xlb.operator.boundary_masker.multires_ray import MultiresMeshMaskerRay
diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index f0f562b3..5009e25a 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -104,6 +104,7 @@ def kernel(
             # Get local indices
             index = wp.vec3i(i, j, k)
 
+            # apply the functional
             functional(
                 index,
                 mesh_id,
diff --git a/xlb/operator/boundary_masker/multires_aabb_close.py b/xlb/operator/boundary_masker/multires_aabb_close.py
index 919fdb31..1693919b 100644
--- a/xlb/operator/boundary_masker/multires_aabb_close.py
+++ b/xlb/operator/boundary_masker/multires_aabb_close.py
@@ -105,9 +105,9 @@ def mres_functional_aabb(
                             dist = wp.length(pos_mesh - cell_center) - 0.5 * max_length
                             weight = dist / (max_length if max_length > 0.0 else 1.0)
                             # distances has cardinality _q; store into this channel
-                            self.write_field(distances_pn, index, direction_idx, wp.float32(weight))
+                            self.write_field(distances_pn, index, direction_idx, self.store_dtype(weight))
                         else:
-                            self.write_field(distances_pn, index, direction_idx, wp.float32(1.0))
+                            self.write_field(distances_pn, index, direction_idx, self.store_dtype(1.0))
 
         # Containers
 
diff --git a/xlb/operator/boundary_masker/multires_ray.py b/xlb/operator/boundary_masker/multires_ray.py
new file mode 100644
index 00000000..4974cfd1
--- /dev/null
+++ b/xlb/operator/boundary_masker/multires_ray.py
@@ -0,0 +1,85 @@
+import warp as wp
+from typing import Any
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.precision_policy import PrecisionPolicy
+from xlb.compute_backend import ComputeBackend
+from xlb.operator.boundary_masker import MeshMaskerRay
+from xlb.operator.operator import Operator
+import neon
+
+
+class MultiresMeshMaskerRay(MeshMaskerRay):
+    """
+    Operator for creating a boundary missing_mask from an STL file in multiresolution simulations.
+
+    This implementation uses warp.mesh_query_ray for efficient mesh-voxel intersection testing.
+    """
+
+    def __init__(
+        self,
+        velocity_set: VelocitySet = None,
+        precision_policy: PrecisionPolicy = None,
+        compute_backend: ComputeBackend = None,
+    ):
+        # Call super
+        super().__init__(velocity_set, precision_policy, compute_backend)
+        if self.compute_backend in [ComputeBackend.JAX, ComputeBackend.WARP]:
+            raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="MeshMaskerRay")
+        def container(
+            mesh_id: Any,
+            id_number: Any,
+            distances: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            needs_mesh_distance: Any,
+            level: Any,
+        ):
+            def ray_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask.get_grid(), level)
+                distances_pn = loader.get_mres_write_handle(distances)
+                bc_mask_pn = loader.get_mres_write_handle(bc_mask)
+                missing_mask_pn = loader.get_mres_write_handle(missing_mask)
+
+                @wp.func
+                def ray_kernel(index: Any):
+                    # apply the functional
+                    functional(
+                        index,
+                        mesh_id,
+                        id_number,
+                        distances_pn,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        needs_mesh_distance,
+                    )
+
+                loader.declare_kernel(ray_kernel)
+
+            return ray_launcher
+
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+        stream=0,
+    ):
+        # Prepare inputs
+        mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
+
+        grid = bc_mask.get_grid()
+        for level in range(grid.num_levels):
+            # Launch the neon container
+            c = self.neon_container(mesh_id, bc_id, distances, bc_mask, missing_mask, wp.static(bc.needs_mesh_distance), level)
+            c.run(stream, container_runtime=neon.Container.ContainerRuntime.neon)
+        return distances, bc_mask, missing_mask
diff --git a/xlb/operator/boundary_masker/ray.py b/xlb/operator/boundary_masker/ray.py
index b1e895f0..0eb0caef 100644
--- a/xlb/operator/boundary_masker/ray.py
+++ b/xlb/operator/boundary_masker/ray.py
@@ -5,6 +5,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
 from xlb.operator.operator import Operator
+import neon
 
 
 class MeshMaskerRay(MeshBoundaryMasker):
@@ -27,34 +28,36 @@ def _construct_warp(self):
         _q = self.velocity_set.q
         _opp_indices = self.velocity_set.opp_indices
 
-        @wp.kernel
-        def kernel(
-            mesh_id: wp.uint64,
-            id_number: wp.int32,
-            distances: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.uint8),
-            needs_mesh_distance: bool,
-        ):
-            # get index
-            i, j, k = wp.tid()
-
-            # Get local indices
-            index = wp.vec3i(i, j, k)
+        # Set local constants
+        lattice_central_index = self.velocity_set.center_index
 
+        @wp.func
+        def functional(
+            index: Any,
+            mesh_id: Any,
+            id_number: Any,
+            distances: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            needs_mesh_distance: Any,
+        ):
             # position of the point
             cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
 
             # Find the fractional distance to the mesh in each direction
-            for direction_idx in range(1, _q):
+            for direction_idx in range(_q):
+                if direction_idx == lattice_central_index:
+                    # Skip the central index as it is not relevant for boundary masking
+                    continue
+
                 direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
                 # Max length depends on ray direction (diagonals are longer)
                 max_length = wp.length(direction_vec)
                 query = wp.mesh_query_ray(mesh_id, cell_center_pos, direction_vec / max_length, max_length)
                 if query.result:
                     # Set the boundary id and missing_mask
-                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
-                    missing_mask[_opp_indices[direction_idx], index[0], index[1], index[2]] = wp.uint8(True)
+                    self.write_field(bc_mask, index, 0, wp.uint8(id_number))
+                    self.write_field(missing_mask, index, _opp_indices[direction_idx], wp.uint8(True))
 
                     # If we don't need the mesh distance, we can return early
                     if not needs_mesh_distance:
@@ -64,9 +67,35 @@ def kernel(
                     pos_mesh = wp.mesh_eval_position(mesh_id, query.face, query.u, query.v)
                     dist = wp.length(pos_mesh - cell_center_pos)
                     weight = self.store_dtype(dist / max_length)
-                    distances[direction_idx, index[0], index[1], index[2]] = weight
+                    self.write_field(distances, index, direction_idx, self.store_dtype(weight))
 
-        return None, kernel
+        @wp.kernel
+        def kernel(
+            mesh_id: wp.uint64,
+            id_number: wp.int32,
+            distances: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.uint8),
+            needs_mesh_distance: bool,
+        ):
+            # get index
+            i, j, k = wp.tid()
+
+            # Get local indices
+            index = wp.vec3i(i, j, k)
+
+            # apply the functional
+            functional(
+                index,
+                mesh_id,
+                id_number,
+                distances,
+                bc_mask,
+                missing_mask,
+                needs_mesh_distance,
+            )
+
+        return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(
@@ -82,3 +111,57 @@ def warp_implementation(
             bc_mask,
             missing_mask,
         )
+
+    def _construct_neon(self):
+        # Use the warp functional for the NEON backend
+        functional, _ = self._construct_warp()
+
+        @neon.Container.factory(name="MeshMaskerRay")
+        def container(
+            mesh_id: Any,
+            id_number: Any,
+            distances: Any,
+            bc_mask: Any,
+            missing_mask: Any,
+            needs_mesh_distance: Any,
+        ):
+            def ray_launcher(loader: neon.Loader):
+                loader.set_grid(bc_mask.get_grid())
+                bc_mask_pn = loader.get_write_handle(bc_mask)
+                missing_mask_pn = loader.get_write_handle(missing_mask)
+                distances_pn = loader.get_write_handle(distances)
+
+                @wp.func
+                def ray_kernel(index: Any):
+                    # apply the functional
+                    functional(
+                        index,
+                        mesh_id,
+                        id_number,
+                        distances_pn,
+                        bc_mask_pn,
+                        missing_mask_pn,
+                        needs_mesh_distance,
+                    )
+
+                loader.declare_kernel(ray_kernel)
+
+            return ray_launcher
+
+        return functional, container
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(
+        self,
+        bc,
+        distances,
+        bc_mask,
+        missing_mask,
+    ):
+        # Prepare inputs
+        mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
+
+        # Launch the appropriate neon container
+        c = self.neon_container(mesh_id, bc_id, distances, bc_mask, missing_mask, wp.static(bc.needs_mesh_distance))
+        c.run(0, container_runtime=neon.Container.ContainerRuntime.neon)
+        return distances, bc_mask, missing_mask
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 85b66842..79d07c08 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -17,7 +17,13 @@
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.collision import ForcedCollision
 from xlb.helper import check_bc_overlaps
-from xlb.operator.boundary_masker import MeshVoxelizationMethod, MultiresMeshMaskerAABB, MultiresMeshMaskerAABBClose, MultiresIndicesBoundaryMasker
+from xlb.operator.boundary_masker import (
+    MeshVoxelizationMethod,
+    MultiresMeshMaskerAABB,
+    MultiresMeshMaskerAABBClose,
+    MultiresIndicesBoundaryMasker,
+    MultiresMeshMaskerRay,
+)
 from xlb.operator.boundary_condition.helper_functions_bc import MultiresEncodeAuxiliaryData
 
 
@@ -236,6 +242,12 @@ def _process_boundary_conditions(cls, boundary_conditions, f_1, bc_mask, missing
                         precision_policy=DefaultConfig.default_precision_policy,
                         compute_backend=DefaultConfig.default_backend,
                     )
+                elif bc.voxelization_method.id is MeshVoxelizationMethod("RAY").id:
+                    mesh_masker = MultiresMeshMaskerRay(
+                        velocity_set=DefaultConfig.velocity_set,
+                        precision_policy=DefaultConfig.default_precision_policy,
+                        compute_backend=DefaultConfig.default_backend,
+                    )
                 elif bc.voxelization_method.id is MeshVoxelizationMethod("AABB_CLOSE").id:
                     mesh_masker = MultiresMeshMaskerAABBClose(
                         velocity_set=DefaultConfig.velocity_set,

From dfee92be9c3245e48a25b8b28c4b8bbd7f62dd17 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 22 Sep 2025 10:12:00 -0400
Subject: [PATCH 181/208] moved prepare skeleton to neon_launch and fixed occ
 none type in neon dense

---
 examples/performance/mlups_3d.py    | 2 --
 xlb/grid/neon_grid.py               | 2 +-
 xlb/operator/stepper/nse_stepper.py | 6 ++++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 2fd2174a..5f9ddeb8 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -185,8 +185,6 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
     # Initialize fields
     omega = 1.0
     f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields()
-    if compute_backend == ComputeBackend.NEON:
-        stepper.prepare_skeleton(f_0, f_1, bc_mask, missing_mask, omega)
 
     warmup_iterations = 10
     # Warp-up iterations
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index 3251f43b..670fcfc5 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -19,7 +19,7 @@ def __init__(
         if backend_config is None:
             backend_config = {
                 "device_list": [0],
-                "skeleton_config": neon.SkeletonConfig.none(),
+                "skeleton_config": neon.SkeletonConfig.OCC.none(),
             }
 
         # check that the config dictionary has the required keys
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index e4c2e6fc..7e10a259 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -40,7 +40,7 @@ def __init__(
         collision_type="BGK",
         forcing_scheme="exact_difference",
         force_vector=None,
-        backend_config=None,
+        backend_config={},
     ):
         super().__init__(grid, boundary_conditions)
         self.backend_config = backend_config
@@ -532,6 +532,8 @@ def nse_stepper_cl(index: Any):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
+        if timestep == 0:
+            self.prepare_skeleton(f_0, f_1, bc_mask, missing_mask, omega)
         self.sk[self.sk_iter].run()
         self.sk_iter = (self.sk_iter + 1) % 2
         return f_0, f_1
@@ -544,7 +546,7 @@ def prepare_skeleton(self, f_0, f_1, bc_mask, missing_mask, omega):
         self.neon_skeleton["even"]["container"] = self.neon_container(f_1, f_0, bc_mask, missing_mask, omega, 1)
         # check if 'occ' is a valid key
         if "occ" not in self.backend_config:
-            occ = neon.SkeletonConfig.none()
+            occ = neon.SkeletonConfig.OCC.none()
         else:
             occ = self.backend_config["occ"]
             # check that occ is of type neon.SkeletonConfig.OCC

From 1dd19ebc647eff66b2359d32c5c46f333004330a Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 22 Sep 2025 11:07:40 -0400
Subject: [PATCH 182/208] added a util function to convert data in various
 backends to JAX for post-processing in single res.

---
 examples/cfd/flow_past_sphere_3d.py     |  9 +--
 examples/cfd/windtunnel_3d.py           |  5 +-
 xlb/grid/neon_grid.py                   | 45 +-----------
 xlb/operator/macroscopic/macroscopic.py |  2 +-
 xlb/utils/__init__.py                   |  1 +
 xlb/utils/mesher.py                     |  4 +-
 xlb/utils/utils.py                      | 94 +++++++++++++++++++++++++
 7 files changed, 107 insertions(+), 53 deletions(-)

diff --git a/examples/cfd/flow_past_sphere_3d.py b/examples/cfd/flow_past_sphere_3d.py
index 8758b759..1616d32a 100644
--- a/examples/cfd/flow_past_sphere_3d.py
+++ b/examples/cfd/flow_past_sphere_3d.py
@@ -80,7 +80,7 @@ def bc_profile_jax():
 
         return bc_profile_jax
 
-    elif compute_backend == ComputeBackend.WARP:
+    else:
         wp_dtype = precision_policy.compute_precision.wp_dtype
         H_y = wp_dtype(grid_shape[1] - 1)  # Height in y direction
         H_z = wp_dtype(grid_shape[2] - 1)  # Height in z direction
@@ -126,6 +126,7 @@ def bc_profile_warp(index: wp.vec3i):
     precision_policy=precision_policy,
     velocity_set=xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
 )
+to_jax = xlb.utils.ToJAX("populations", velocity_set.q, grid_shape)
 
 # Setup Momentum Transfer for Force Calculation
 from xlb.operator.force.momentum_transfer import MomentumTransfer
@@ -136,6 +137,8 @@ def bc_profile_warp(index: wp.vec3i):
 
 # Post-Processing Function
 def post_process(step, f_0, f_1):
+    wp.synchronize()
+
     # Compute lift and drag
     boundary_force = momentum_transfer(f_0, f_1, bc_mask, missing_mask)
     drag = boundary_force[0]  # x-direction
@@ -146,7 +149,7 @@ def post_process(step, f_0, f_1):
 
     # Convert to JAX array if necessary
     if not isinstance(f_0, jnp.ndarray):
-        f_0 = wp.to_jax(f_0)
+        f_0 = to_jax(f_0)
         wp.synchronize()
 
     rho, u = macro(f_0)
@@ -177,8 +180,6 @@ def post_process(step, f_0, f_1):
     f_0, f_1 = f_1, f_0  # Swap the buffers
 
     if step % post_process_interval == 0 or step == num_steps - 1:
-        if compute_backend == ComputeBackend.WARP:
-            wp.synchronize()
         post_process(step, f_0, f_1)
         end_time = time.time()
         elapsed = end_time - start_time
diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index 3786c4db..2af47064 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -189,7 +189,7 @@ def post_process(
     """
     # Convert to JAX array if necessary
     if not isinstance(f_0, jnp.ndarray):
-        f_0_jax = wp.to_jax(f_0)
+        f_0_jax = to_jax(f_0)
     else:
         f_0_jax = f_0
 
@@ -235,6 +235,7 @@ def post_process(
     precision_policy=precision_policy,
     velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
 )
+to_jax = xlb.utils.ToJAX("populations", velocity_set.q, grid_shape)
 
 # Initialize Lists to Store Coefficients and Time Steps
 time_steps = []
@@ -251,7 +252,7 @@ def post_process(
 
     # Print progress at intervals
     if step % print_interval == 0:
-        if compute_backend == ComputeBackend.WARP:
+        if compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
             wp.synchronize()
         elapsed_time = time.time() - start_time
         print(f"Iteration: {step}/{num_steps} | Time elapsed: {elapsed_time:.2f}s")
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index 670fcfc5..e775535d 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -1,9 +1,8 @@
-import warp as wp
 import neon
 from .grid import Grid
 from xlb.precision_policy import Precision
 from xlb.compute_backend import ComputeBackend
-from typing import Literal, List
+from typing import Literal
 from xlb import DefaultConfig
 
 
@@ -40,7 +39,6 @@ def __init__(
         self.dim = None
         self.grid = None
         self.velocity_set = velocity_set
-        self.warp_grid = WarpGrid(shape)
 
         super().__init__(shape, ComputeBackend.NEON)
 
@@ -90,46 +88,5 @@ def create_field(
             field.fill_run(value=fill_value, stream_idx=0)
         return field
 
-    def _create_warp_field(
-        self, cardinality: int, dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None, fill_value=None, ne_field=None
-    ):
-        warp_field = self.warp_grid.create_field(cardinality, dtype, fill_value)
-        if ne_field is None:
-            return warp_field
-
-        _d = self.velocity_set.d
-
-        import typing
-
-        @neon.Container.factory
-        def container(src_field: typing.Any, dst_field: typing.Any, cardinality: wp.int32):
-            def loading_step(loader: neon.Loader):
-                loader.declare_execution_scope(self.grid)
-                src_pn = loader.get_read_handel(src_field)
-
-                @wp.func
-                def cloning(gridIdx: typing.Any):
-                    cIdx = wp.neon_global_idx(src_pn, gridIdx)
-                    gx = wp.neon_get_x(cIdx)
-                    gy = wp.neon_get_y(cIdx)
-                    gz = wp.neon_get_z(cIdx)
-
-                    # XLB is flattening the z dimension in 3D, while neon uses the y dimension
-                    if _d == 2:
-                        gy, gz = gz, gy
-
-                    for card in range(cardinality):
-                        value = wp.neon_read(src_pn, gridIdx, card)
-                        dst_field[card, gx, gy, gz] = value
-
-                loader.declare_kernel(cloning)
-
-            return loading_step
-
-        c = container(src_field=ne_field, dst_field=warp_field, cardinality=cardinality)
-        c.run(0)
-        wp.synchronize()
-        return warp_field
-
     def get_neon_backend(self):
         return self.bk
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 61a6ef88..00d0076c 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -20,7 +20,7 @@ def __init__(self, *args, **kwargs):
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0), inline=True)
-    def jax_implementation(self, f):
+    def jax_implementation(self, f, rho=None, u=None):
         rho = self.zero_moment(f)
         u = self.first_moment(f, rho)
         return rho, u
diff --git a/xlb/utils/__init__.py b/xlb/utils/__init__.py
index 7af8f80c..6c31e90c 100644
--- a/xlb/utils/__init__.py
+++ b/xlb/utils/__init__.py
@@ -6,5 +6,6 @@
     rotate_geometry,
     voxelize_stl,
     axangle2mat,
+    ToJAX,
 )
 from .mesher import make_cuboid_mesh, MultiresIO
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index d71caf93..43304e38 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -189,7 +189,7 @@ def process_geometry(self, levels_data, scale):
 
         for level_idx, (data, voxel_size, origin, level) in enumerate(levels_data):
             origin = origin * voxel_size
-            corners_list, conn_list, _ = self._process_level(data, voxel_size, origin, level, point_id_offsets[level_idx])
+            corners_list, conn_list = self._process_level(data, voxel_size, origin, level, point_id_offsets[level_idx])
 
             if corners_list:
                 print(f"\tProcessing level {level}: Voxel size {voxel_size * scale}, Origin {origin}, Shape {data.shape}")
@@ -231,7 +231,7 @@ def _process_level(self, data, voxel_size, origin, level, point_id_offset):
             all_connectivity.append(connectivity)
             pid_offset += len(chunk) * 8
 
-        return all_corners, all_connectivity, level
+        return all_corners, all_connectivity
 
     def _process_voxel_chunk(self, true_indices, origin, voxel_size, point_id_offset):
         """
diff --git a/xlb/utils/utils.py b/xlb/utils/utils.py
index 0a9858a5..a3aed2af 100644
--- a/xlb/utils/utils.py
+++ b/xlb/utils/utils.py
@@ -314,3 +314,97 @@ def axangle2mat(axis, angle, is_normalized=False):
         [xyC + zs, y * yC + c, yzC - xs],
         [zxC - ys, yzC + xs, z * zC + c],
     ])
+
+
+class ToJAX(object):
+    def __init__(self, field_name, field_cardinality, grid_shape, store_precision=None):
+        """
+        Initialize the MultiresIO object.
+
+        Parameters
+        ----------
+        field_name : str
+            The name of the field to be converted.
+        field_cardinality : int
+            The cardinality of the field to be converted.
+        grid_shape : tuple
+            The shape of the grid on which the field is defined.
+        store_precision : str, optional
+            The precision policy for storing data.
+        """
+        from xlb.compute_backend import ComputeBackend
+        from xlb.grid import grid_factory
+        from xlb import DefaultConfig
+
+        # Assign to self
+        self.field_name = field_name
+        self.field_cardinality = field_cardinality
+        self.grid_shape = grid_shape
+        self.compute_backend = DefaultConfig.default_backend
+        self.velocity_set = DefaultConfig.velocity_set
+        if store_precision is None:
+            self.store_precision = DefaultConfig.default_precision_policy.store_precision
+            self.store_dtype = DefaultConfig.default_precision_policy.store_precision.wp_dtype
+
+        if self.compute_backend == ComputeBackend.NEON:
+            # Allocate warp fields for copying neon fields
+            # Use the warp backend to create dense fields for copying NEON dGrid fields
+            grid_dense = grid_factory(grid_shape, compute_backend=ComputeBackend.WARP)
+            self.warp_field = grid_dense.create_field(cardinality=self.field_cardinality, dtype=self.store_precision)
+
+    def copy_neon_to_warp(self, neon_field):
+        """Convert a dense neon field to a warp field by copying."""
+        import warp as wp
+        import neon
+        from typing import Any
+
+        assert neon_field.get_grid().name == "dGrid", "to_warp only supports dense grids"
+        _d = self.velocity_set.d
+
+        @neon.Container.factory("to_warp")
+        def container(src_field: Any, dst_field: Any, cardinality: wp.int32):
+            def loading_step(loader: neon.Loader):
+                loader.set_grid(src_field.get_grid())
+                src_pn = loader.get_read_handle(src_field)
+
+                @wp.func
+                def cloning(gridIdx: Any):
+                    cIdx = wp.neon_global_idx(src_pn, gridIdx)
+                    gx = wp.neon_get_x(cIdx)
+                    gy = wp.neon_get_y(cIdx)
+                    gz = wp.neon_get_z(cIdx)
+
+                    # XLB is flattening the z dimension in 3D, while neon uses the y dimension
+                    if _d == 2:
+                        gy, gz = gz, gy
+
+                    for card in range(cardinality):
+                        value = wp.neon_read(src_pn, gridIdx, card)
+                        dst_field[card, gx, gy, gz] = value
+
+                loader.declare_kernel(cloning)
+
+            return loading_step
+
+        cardinality = neon_field.cardinality
+        c = container(neon_field, self.warp_field, cardinality)
+        c.run(0)
+        wp.synchronize()
+        return self.warp_field
+
+    def __call__(self, field):
+        from xlb.compute_backend import ComputeBackend
+        import warp as wp
+
+        if self.compute_backend == ComputeBackend.JAX:
+            return field
+        elif self.compute_backend == ComputeBackend.WARP:
+            return wp.to_jax(field)
+        elif self.compute_backend == ComputeBackend.NEON:
+            assert field.cardinality == self.field_cardinality, (
+                f"Field cardinality mismatch! Expected {self.field_cardinality}, got {field.cardinality}!"
+            )
+            return wp.to_jax(self.copy_neon_to_warp(field))
+
+        else:
+            raise ValueError("Unsupported compute backend!")

From 12aaa09a3c6284397687b79d5bf287d2469cb149 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 22 Sep 2025 15:14:00 -0400
Subject: [PATCH 183/208] added extrapolation outflow BC to Neon

---
 .../bc_extrapolation_outflow.py               | 46 ++++++++++++++++++-
 .../boundary_condition/boundary_condition.py  | 33 ++++++-------
 xlb/operator/stepper/nse_stepper.py           |  2 +-
 3 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 4574f3a0..9c7476b7 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -63,6 +63,8 @@ def __init__(
         # Unpack the two warp functionals needed for this BC!
         if self.compute_backend == ComputeBackend.WARP:
             self.warp_functional, self.assemble_auxiliary_data = self.warp_functional
+        elif self.compute_backend == ComputeBackend.NEON:
+            self.neon_functional, self.assemble_auxiliary_data = self.neon_functional
 
     def _get_normal_vectors(self, indices):
         # Get the frequency count and most common element directly
@@ -173,7 +175,7 @@ def functional(
             return _f
 
         @wp.func
-        def assemble_auxiliary_data(
+        def assemble_auxiliary_data_warp(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -199,7 +201,40 @@ def assemble_auxiliary_data(
                     _f[_opp_indices[l]] = (self.compute_dtype(1.0) - sound_speed) * _f_pre[l] + sound_speed * f_aux
             return _f
 
+        @wp.func
+        def assemble_auxiliary_data_neon(
+            index: Any,
+            timestep: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            _f_pre: Any,
+            _f_post: Any,
+            level: Any = 0,
+        ):
+            # Prepare time-dependent dynamic data for imposing the boundary condition in the next iteration after streaming.
+            # We use directions that leave the domain for storing this prepared data.
+            # Since this function is called post-collision: f_pre = f_post_stream and f_post = f_post_collision
+            _f = _f_post
+            nv = get_normal_vectors(missing_mask)
+            for lattice_dir in range(self.velocity_set.q):
+                if missing_mask[lattice_dir] == wp.uint8(1):
+                    # f_0 is the post-collision values of the current time-step
+                    # Get pull index associated with the "neighbours" pull_index
+                    offset = wp.vec3i(-_c[0, lattice_dir], -_c[1, lattice_dir], -_c[2, lattice_dir])
+                    for d in range(self.velocity_set.d):
+                        offset[d] = offset[d] - nv[d]
+                    offset_pull_index = wp.neon_ngh_idx(wp.int8(offset[0]), wp.int8(offset[1]), wp.int8(offset[2]))
+
+                    # The following is the post-streaming values of the neighbor cell
+                    # This function reads a field value at a given neighboring index and direction.
+                    unused_is_valid = wp.bool(False)
+                    f_aux = self.compute_dtype(wp.neon_read_ngh(f_0, index, offset_pull_index, lattice_dir, self.compute_dtype(0.0), unused_is_valid))
+                    _f[_opp_indices[lattice_dir]] = (self.compute_dtype(1.0) - sound_speed) * _f_pre[lattice_dir] + sound_speed * f_aux
+            return _f
+
         kernel = self._construct_kernel(functional)
+        assemble_auxiliary_data = assemble_auxiliary_data_warp if self.compute_backend == ComputeBackend.WARP else assemble_auxiliary_data_neon
 
         return (functional, assemble_auxiliary_data), kernel
 
@@ -212,3 +247,12 @@ def warp_implementation(self, _f_pre, _f_post, bc_mask, missing_mask):
             dim=_f_pre.shape[1:],
         )
         return _f_post
+
+    def _construct_neon(self):
+        functional, _ = self._construct_warp()
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Raise an exception as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index a949ef35..9041fbae 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -78,25 +78,22 @@ def __init__(
         # Currently we support three methods based on (a) aabb method (b) ray casting and (c) winding number.
         self.voxelization_method = voxelization_method
 
-        if self.compute_backend == ComputeBackend.WARP:
-            # Set local constants TODO: This is a hack and should be fixed with warp update
-            _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-            _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)  # TODO fix vec bool
-
-        @wp.func
-        def assemble_auxiliary_data(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            return f_post
+        # Construct a default warp functional for assembling auxiliary data if needed
+        if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
+
+            @wp.func
+            def assemble_auxiliary_data(
+                index: Any,
+                timestep: Any,
+                missing_mask: Any,
+                f_0: Any,
+                f_1: Any,
+                f_pre: Any,
+                f_post: Any,
+                level: Any = 0,
+            ):
+                return f_post
 
-        # Construct some helper warp functions for getting tid data
-        if self.compute_backend == ComputeBackend.WARP:
             self.assemble_auxiliary_data = assemble_auxiliary_data
 
     def pad_indices(self):
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 7e10a259..1b71b5e9 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -420,7 +420,7 @@ def apply_bc(
                             f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
                     if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].assemble_dynamic_data)(
+                            f_result = wp.static(self.boundary_conditions[i].assemble_auxiliary_data)(
                                 index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
                             )
             return f_result

From 48df9dbd3c9e201b34c2a421e6f8a6c62daeba5b Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 22 Sep 2025 15:32:25 -0400
Subject: [PATCH 184/208] updated wind tunnel example with Neon on multi-GPU

---
 examples/cfd/windtunnel_3d.py           | 8 +++++++-
 xlb/operator/boundary_masker/winding.py | 3 +++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index 2af47064..2d7ecf1f 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -21,6 +21,8 @@
 import matplotlib.pyplot as plt
 from xlb.operator.boundary_masker import MeshVoxelizationMethod
 
+import neon
+
 # -------------------------- Simulation Setup --------------------------
 
 # Grid parameters
@@ -28,7 +30,7 @@
 grid_shape = (grid_size_x, grid_size_y, grid_size_z)
 
 # Simulation Configuration
-compute_backend = ComputeBackend.WARP
+compute_backend = ComputeBackend.NEON
 precision_policy = PrecisionPolicy.FP32FP32
 
 velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
@@ -108,11 +110,15 @@
 boundary_conditions = [bc_walls, bc_left, bc_do_nothing, bc_car]
 
 
+# Configure backend options:
+backend_config = {"occ": neon.SkeletonConfig.OCC.from_string("standard"), "device_list": [0, 1]} if compute_backend == ComputeBackend.NEON else {}
+
 # Setup Stepper
 stepper = IncompressibleNavierStokesStepper(
     grid=grid,
     boundary_conditions=boundary_conditions,
     collision_type="KBC",
+    backend_config=backend_config,
 )
 
 # Prepare Fields
diff --git a/xlb/operator/boundary_masker/winding.py b/xlb/operator/boundary_masker/winding.py
index 9e1ae722..b44edc2d 100644
--- a/xlb/operator/boundary_masker/winding.py
+++ b/xlb/operator/boundary_masker/winding.py
@@ -20,6 +20,9 @@ def __init__(
     ):
         # Call super
         super().__init__(velocity_set, precision_policy, compute_backend)
+        assert self.compute_backend != ComputeBackend.NEON, (
+            'MeshVoxelizationMethod("WINDING") is not implemented in Neon yet! Please use a different method of mesh voxelization!'
+        )
 
     def _construct_warp(self):
         # Make constants for warp

From e37c8d6357723fda62e6da6751ac62075916808b Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 22 Sep 2025 17:09:43 -0400
Subject: [PATCH 185/208] fixed the call to assemble_auxiliary_data for mres.

---
 xlb/operator/stepper/nse_multires_stepper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 79d07c08..63ff560d 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -324,7 +324,7 @@ def apply_bc(
                             f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
                     if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
                         if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].prepare_bc_auxilary_data)(
+                            f_result = wp.static(self.boundary_conditions[i].assemble_auxiliary_data)(
                                 index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
                             )
             return f_result

From 831c51e332cb2ef26c71e04db8109cfe8bb81f9b Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 6 Oct 2025 19:07:14 +0200
Subject: [PATCH 186/208] refactoring(perf-test): adding more options to the
 performance tests

---
 examples/performance/mlups_3d.py          |   6 ++
 examples/performance/mlups_3d_multires.py | 122 ++++++++++++++++++----
 2 files changed, 109 insertions(+), 19 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 5f9ddeb8..b6f2aeca 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -439,8 +439,14 @@ def print_scalability_summary(args, stats_list):
 
 def report(args, stats):
     import neon
+    import sys
 
     report = neon.Report("LBM MLUPS LDC")
+    
+    # Save the full command line
+    command_line = " ".join(sys.argv)
+    report.add_member("command_line", command_line)
+    
     report.add_member("velocity_set", args.velocity_set.__class__.__name__)
     report.add_member("compute_backend", args.compute_backend.name)
     report.add_member("precision_policy", args.precision)
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index f5df5fc3..06a938f8 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -13,18 +13,57 @@
 
 
 def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
+    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation with Multi-resolution Grid")
     # Positional arguments
     parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
     parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
     parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+    parser.add_argument("num_levels", type=int, help="Number of levels for the multiresolution grid")
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
+    parser.add_argument("--velocity_set", type=str, default="D3Q19",
+                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
+    parser.add_argument("--collision_model", type=str, default="BGK",
+                        help="Collision model: BGK or KBC (default: BGK)")
 
-    return parser.parse_args()
+    parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
+    parser.add_argument("--export_final_velocity", action="store_true",
+                        help="Export the final velocity field to a vti file (default: disabled)")
+
+    args = parser.parse_args()
+    
+    print_args(args)
+
+    if args.compute_backend != "neon":
+        raise ValueError("Invalid compute backend specified. Use 'neon' which supports multi-resolution!")
+
+    if args.collision_model not in ["BGK", "KBC"]:
+        raise ValueError("Invalid collision model specified. Use 'BGK' or 'KBC'.")
+
+    return args
+
+
+def print_args(args):
+    # Print simulation configuration
+    print("=" * 60)
+    print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
+    print("=" * 60)
+    print(f"Grid Size:            {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"Total Lattice Points: {args.cube_edge ** 3:,}")
+    print(f"Time Steps:           {args.num_steps:,}")
+    print(f"Number Levels:        {args.num_levels}")
+    print(f"Compute Backend:      {args.compute_backend}")
+    print(f"Precision Policy:     {args.precision}")
+    print(f"Velocity Set:         {args.velocity_set}")
+    print(f"Collision Model:      {args.collision_model}")
+    print(f"Generate Report:      {'Yes' if args.report else 'No'}")
+    print(f"Export Velocity:      {'Yes' if args.export_final_velocity else 'No'}")
+
+    print("=" * 60)
+    print("Starting simulation...")
+    print()
 
 
 def setup_simulation(args):
@@ -61,7 +100,7 @@ def setup_simulation(args):
     return velocity_set
 
 
-def problem1(grid_shape, velocity_set):
+def problem1(grid_shape, velocity_set, num_levels):
     def peel(dim, idx, peel_level, outwards):
         if outwards:
             xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
@@ -77,7 +116,7 @@ def peel(dim, idx, peel_level, outwards):
     dim = neon.Index_3d(grid_shape[0], grid_shape[1], grid_shape[2])
 
     def get_peeled_np(level, width):
-        divider = 2**level
+        divider = 2 ** level
         m = neon.Index_3d(dim.x // divider, dim.y // divider, dim.z // divider)
         if level == 0:
             m = dim
@@ -101,14 +140,13 @@ def get_levels(num_levels):
             l = get_peeled_np(i, 8)
             levels.append(l)
         lastLevel = num_levels - 1
-        divider = 2**lastLevel
+        divider = 2 ** lastLevel
         m = neon.Index_3d(dim.x // divider + 1, dim.y // divider + 1, dim.z // divider + 1)
         lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
         lastLevel = np.ascontiguousarray(lastLevel, dtype=np.int32)
         levels.append(lastLevel)
         return levels
 
-    num_levels = 4
     levels = get_levels(num_levels)
 
     grid = multires_grid_factory(
@@ -121,7 +159,8 @@ def get_levels(num_levels):
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
     lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in
+             range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
     # convert bc indices to a list of list, where the first entry of the list corresponds to the finest level
     lid = [lid] + [[] for _ in range(num_levels - 1)]
@@ -129,14 +168,13 @@ def get_levels(num_levels):
     return grid, lid, walls
 
 
-def problem2(grid_shape, velocity_set):
+def problem2(grid_shape, velocity_set, num_levels):
     # Example 2: Coarsest at the edges (2 level only)
-    num_levels = 4
     level_origins = []
     level_list = []
     for lvl in range(num_levels):
-        divider = 2**lvl
-        growth = 1.5**lvl
+        divider = 2 ** lvl
+        growth = 1.5 ** lvl
         shape = grid_shape[0] // divider, grid_shape[1] // divider, grid_shape[2] // divider
         if lvl == num_levels - 1:
             level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
@@ -159,7 +197,8 @@ def problem2(grid_shape, velocity_set):
     box = grid.bounding_box_indices(shape=grid.level_to_shape(num_levels - 1))
     box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(1), remove_edges=True)
     lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in
+             range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
     # convert bc indices to a list of list, where the first entry of the list corresponds to the finest level
     lid = [[] for _ in range(num_levels - 1)] + [lid]
@@ -167,7 +206,13 @@ def problem2(grid_shape, velocity_set):
     return grid, lid, walls
 
 
-def run(velocity_set, grid_shape, num_steps):
+def run(velocity_set,
+        grid_shape,
+        num_steps,
+        num_levels,
+        collision_model,
+        export_final_velocity
+        ):
     # Create grid and setup boundary conditions
 
     # Convert indices to list of indices per level
@@ -177,10 +222,10 @@ def run(velocity_set, grid_shape, num_steps):
     # walls = construct_indices_per_level(grid_shape, walls, levels_mask, level_origins)
 
     # Example 1: fine to coarse
-    # grid, lid, walls = problem1(grid_shape, velocity_set)
+    # grid, lid, walls = problem1(grid_shape, velocity_set, num_levels)
 
     # Example 2: Coarse to fine:
-    grid, lid, walls = problem2(grid_shape, velocity_set)
+    grid, lid, walls = problem1(grid_shape, velocity_set, num_levels)
 
     prescribed_vel = 0.1
     boundary_conditions = [
@@ -195,7 +240,10 @@ def run(velocity_set, grid_shape, num_steps):
     omega = 1.0 / (3.0 * visc + 0.5)
 
     # Define a multi-resolution simulation manager
-    sim = xlb.helper.MultiresSimulationManager(omega=omega, grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC")
+    sim = xlb.helper.MultiresSimulationManager(omega=omega,
+                                               grid=grid,
+                                               boundary_conditions=boundary_conditions,
+                                               collision_type=collision_model)
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
@@ -212,6 +260,9 @@ def run(velocity_set, grid_shape, num_steps):
     t = time.time() - start_time
     print(f"Timing  {t}")
 
+    if export_final_velocity:
+        sim.export_macroscopic("u_lid_driven_cavity_")
+
     # sim.export_macroscopic("u_lid_driven_cavity_")
     num_levels = grid.count_levels
     return {"time": t, "num_levels": num_levels}
@@ -219,7 +270,7 @@ def run(velocity_set, grid_shape, num_steps):
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
     num_step_finer = num_steps * 2 ** (num_levels - 1)
-    total_lattice_updates = cube_edge**3 * num_step_finer
+    total_lattice_updates = cube_edge ** 3 * num_step_finer
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return {"EMLUPS": mlups, "finer_steps": num_step_finer}
 
@@ -236,11 +287,40 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
     # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
 
+def generate_report(args, stats, mlups_stats):
+    """Generate a neon report file with simulation parameters and results"""
+    import neon
+    import sys
+
+    report = neon.Report("LBM MLUPS Multiresolution LDC")
+    
+    # Save the full command line
+    command_line = " ".join(sys.argv)
+    report.add_member("command_line", command_line)
+    
+    report.add_member("velocity_set", args.velocity_set)
+    report.add_member("compute_backend", args.compute_backend)
+    report.add_member("precision_policy", args.precision)
+    report.add_member("collision_model", args.collision_model)
+    report.add_member("grid_size", args.cube_edge)
+    report.add_member("num_steps", args.num_steps)
+    report.add_member("num_levels", stats["num_levels"])
+    report.add_member("finer_steps", mlups_stats["finer_steps"])
+    
+    # Performance metrics
+    report.add_member("elapsed_time", stats["time"])
+    report.add_member("emlups", mlups_stats["EMLUPS"])
+    
+    report_name = f"mlups_3d_multires_size_{args.cube_edge}_levels_{stats['num_levels']}"
+    report.write(report_name, True)
+    print("Report generated successfully.")
+
+
 def main():
     args = parse_arguments()
     velocity_set = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    stats = run(velocity_set, grid_shape, args.num_steps)
+    stats = run(velocity_set, grid_shape, args.num_steps, args.num_levels, args.collision_model, args.export_final_velocity)
     mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats["time"], stats["num_levels"])
 
     print(f"Simulation completed in {stats['time']:.2f} seconds")
@@ -252,6 +332,10 @@ def main():
     EMLUPS = mlups_stats["EMLUPS"]
     print(f"EMLUPs: {EMLUPS:.2f}")
 
+    # Generate report if requested
+    if args.report:
+        generate_report(args, stats, mlups_stats)
+
 
 if __name__ == "__main__":
     main()

From dfacaa2c6a49f3b2606375d2222edb4daa165e6d Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 7 Oct 2025 12:55:32 +0200
Subject: [PATCH 187/208] refactoring(CLI): improving CLI for the performance
 tests

refactoring(CLI): improving CLI for the performance tests
---
 examples/performance/mlups_3d.py          | 263 +++++++++++-----------
 examples/performance/mlups_3d_multires.py |  74 ++++--
 2 files changed, 189 insertions(+), 148 deletions(-)

diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index b6f2aeca..2b7a0ea1 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -16,32 +16,57 @@
 
 
 def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation")
-    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-    parser.add_argument(
-        "--gpu_devices",
-        type=str,
-        default=None,
-        help="List of the CUDA devices to use (e.g., --gpu_devices=[0,1,2]). This is only used for Neon backend.",
-    )
-    # add a flat to choose between 19 or 27 velocity set
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-    # add a flat to choose between multi-gpu occ options based on the neon occ:
-    parser.add_argument(
-        "--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, none) (default: standard)"
-    )
-    parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
-    parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)")
-    parser.add_argument("--measure_scalability", action="store_true", help="Measure scalability of the simulation (default: disabled)")
-    parser.add_argument(
-        "--repetitions",
-        type=int,
-        default=1,
-        help="Number of repetitions for the simulation (default: 1) to get the average MLUPs and standard deviation",
+    # Define valid options for consistency
+    COMPUTE_BACKENDS = ["neon", "warp", "jax"]
+    PRECISION_OPTIONS = ["fp32/fp32", "fp64/fp64", "fp64/fp32", "fp32/fp16"]
+    VELOCITY_SETS = ["D3Q19", "D3Q27"]
+    COLLISION_MODELS = ["BGK", "KBC"]
+    OCC_OPTIONS = ["standard", "none"]
+    
+    parser = argparse.ArgumentParser(
+        description="MLUPS Benchmark for 3D Lattice Boltzmann Method Simulation",
+        epilog=f"""
+Examples:
+  %(prog)s 100 1000 neon fp32/fp32
+  %(prog)s 200 500 neon fp64/fp64 --collision_model KBC --velocity_set D3Q27
+  %(prog)s 150 2000 neon fp32/fp32 --gpu_devices=[0,1,2] --measure_scalability --report
+  %(prog)s 100 1000 neon fp32/fp32 --repetitions 5 --export_final_velocity
+        """,
+        formatter_class=argparse.RawDescriptionHelpFormatter
     )
+    
+    # Positional arguments
+    parser.add_argument("cube_edge", type=int, 
+                       help="Length of the edge of the cubic grid (e.g., 100)")
+    parser.add_argument("num_steps", type=int, 
+                       help="Number of timesteps for the simulation (e.g., 1000)")
+    parser.add_argument("compute_backend", type=str, 
+                       choices=COMPUTE_BACKENDS,
+                       help=f"Backend for the simulation ({', '.join(COMPUTE_BACKENDS)})")
+    parser.add_argument("precision", type=str, 
+                       choices=PRECISION_OPTIONS,
+                       help=f"Precision for the simulation ({', '.join(PRECISION_OPTIONS)})")
+    
+    # Optional arguments
+    parser.add_argument("--gpu_devices", type=str, default=None,
+                       help="CUDA devices to use for Neon backend (e.g., [0,1,2] or [0])")
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", 
+                       choices=VELOCITY_SETS,
+                       help=f"Lattice velocity set (default: D3Q19, choices: {', '.join(VELOCITY_SETS)})")
+    parser.add_argument("--collision_model", type=str, default="BGK", 
+                       choices=COLLISION_MODELS,
+                       help=f"Collision model (default: BGK, choices: {', '.join(COLLISION_MODELS)}, KBC requires D3Q27)")
+    parser.add_argument("--occ", type=str, default="standard", 
+                       choices=OCC_OPTIONS,
+                       help=f"Overlapping Communication and Computation strategy (default: standard, choices: {', '.join(OCC_OPTIONS)})")
+    parser.add_argument("--report", action="store_true", 
+                       help="Generate Neon performance report")
+    parser.add_argument("--export_final_velocity", action="store_true", 
+                       help="Export final velocity field to VTI file")
+    parser.add_argument("--measure_scalability", action="store_true", 
+                       help="Measure performance across different GPU counts")
+    parser.add_argument("--repetitions", type=int, default=1, metavar="N",
+                       help="Number of simulation repetitions for statistical analysis (default: 1)")
 
     args = parser.parse_args()
 
@@ -56,29 +81,31 @@ def parse_arguments():
         except (ValueError, SyntaxError):
             raise ValueError("Invalid gpu_devices format. Use format like [0,1,2] or [0]")
 
-    # Checking the compute backend and covert it to the right type
-    compute_backend = None
-    if args.compute_backend == "jax":
-        compute_backend = ComputeBackend.JAX
-    elif args.compute_backend == "warp":
-        compute_backend = ComputeBackend.WARP
-    elif args.compute_backend == "neon":
-        compute_backend = ComputeBackend.NEON
-    else:
-        raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.")
+    # Validate and convert compute backend
+    compute_backend_map = {
+        "jax": ComputeBackend.JAX,
+        "warp": ComputeBackend.WARP,
+        "neon": ComputeBackend.NEON,
+    }
+    compute_backend = compute_backend_map.get(args.compute_backend)
+    if compute_backend is None:
+        raise ValueError(f"Invalid compute backend '{args.compute_backend}'. Use: {', '.join(COMPUTE_BACKENDS)}")
     args.compute_backend = compute_backend
 
-    # Checking OCC
-    if args.occ not in ["standard", "none"]:
-        raise ValueError("Invalid occupancy option. Use 'standard', or 'none'.")
-    if args.gpu_devices is None and args.compute_backend == ComputeBackend.NEON:
-        print("[Warning] No GPU devices specified. Using default device 0.")
-        args.gpu_devices = [0]
+    # Handle GPU devices for Neon backend
     if args.compute_backend == ComputeBackend.NEON:
+        if args.gpu_devices is None:
+            print("[INFO] No GPU devices specified. Using default device 0.")
+            args.gpu_devices = [0]
+        
         import neon
-
-        occ = neon.SkeletonConfig.OCC.from_string(args.occ)
-        args.occ = occ
+        occ_enum = neon.SkeletonConfig.OCC.from_string(args.occ)
+        args.occ_enum = occ_enum  # Store the enum for Neon
+        args.occ_display = args.occ  # Store the original string for display
+    else:
+        if args.gpu_devices is not None:
+            raise ValueError(f"--gpu_devices can only be used with Neon backend, not {args.compute_backend.name}")
+        args.gpu_devices = [0]  # Default for non-Neon backends
 
     # Checking precision policy
     precision_policy_map = {
@@ -89,12 +116,12 @@ def parse_arguments():
     }
     precision_policy = precision_policy_map.get(args.precision)
     if precision_policy is None:
-        raise ValueError("Invalid precision specified.")
+        raise ValueError(f"Invalid precision '{args.precision}'. Use: {', '.join(PRECISION_OPTIONS)}")
     args.precision_policy = precision_policy
 
-    # Checking velocity set
-    if args.velocity_set not in ["D3Q19", "D3Q27"]:
-        raise ValueError("Invalid velocity set. Use 'D3Q19' or 'D3Q27'.")
+    # Validate collision model and velocity set compatibility
+    if args.collision_model == "KBC" and args.velocity_set != "D3Q27":
+        raise ValueError("KBC collision model requires D3Q27 velocity set. Use --velocity_set D3Q27")
 
     if args.velocity_set == "D3Q19":
         velocity_set = xlb.velocity_set.D3Q19(precision_policy=args.precision_policy, compute_backend=compute_backend)
@@ -102,11 +129,6 @@ def parse_arguments():
         velocity_set = xlb.velocity_set.D3Q27(precision_policy=args.precision_policy, compute_backend=compute_backend)
     args.velocity_set = velocity_set
 
-    if args.gpu_devices is not None and args.compute_backend != ComputeBackend.NEON:
-        raise ValueError("--gpu_devices can be used only with the Neon backend.")
-
-    if args.gpu_devices is None:
-        args.gpu_devices = [0]
 
     print_args(args)
 
@@ -114,30 +136,39 @@ def parse_arguments():
 
 
 def print_args(args):
-    # Print simulation configuration
-    print("=" * 60)
-    print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
-    print("=" * 60)
-    print(f"Grid Size:            {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
-    print(f"Total Lattice Points: {args.cube_edge**3:,}")
-    print(f"Time Steps:           {args.num_steps:,}")
-    print(f"Compute Backend:      {args.compute_backend.name}")
-    print(f"Precision Policy:     {args.precision}")
-    print(f"Velocity Set:         {args.velocity_set.__class__.__name__}")
-    print(f"Generate Report:      {'Yes' if args.report else 'No'}")
-    print(f"Measure Scalability:  {'Yes' if args.measure_scalability else 'No'}")
-    print(f"Export Velocity:      {'Yes' if args.export_final_velocity else 'No'}")
-    print(f"Repetitions:          {args.repetitions}")
-
+    """Print simulation configuration in a clean, organized format"""
+    print("\n" + "=" * 70)
+    print("                    SIMULATION CONFIGURATION")
+    print("=" * 70)
+    
+    # Grid and simulation parameters
+    print("GRID & SIMULATION:")
+    print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"  Total Lattice Points:   {args.cube_edge**3:,}")
+    print(f"  Time Steps:             {args.num_steps:,}")
+    print(f"  Repetitions:            {args.repetitions}")
+    
+    # Computational settings
+    print("\nCOMPUTATIONAL SETTINGS:")
+    print(f"  Compute Backend:        {args.compute_backend.name}")
+    print(f"  Precision Policy:       {args.precision}")
+    print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Collision Model:        {args.collision_model}")
+    
+    # Backend-specific settings
     if args.compute_backend.name == "NEON":
-        print(f"GPU Devices:          {args.gpu_devices}")
-        # Convert the neon OCC enum back to string for display
-        occ_display = args.occ.to_string() if hasattr(args.occ, "__class__") else args.occ
-        print(f"OCC Strategy:         {occ_display}")
+        print("\nNEON BACKEND SETTINGS:")
+        print(f"  GPU Devices:            {args.gpu_devices}")
+        print(f"  OCC Strategy:           {args.occ_display}")
+    
+    # Output options
+    print("\nOUTPUT OPTIONS:")
+    print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
+    print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
+    print(f"  Export Velocity:        {'Yes' if args.export_final_velocity else 'No'}")
 
-    print("=" * 60)
-    print("Starting simulation...")
-    print()
+    print("=" * 70)
+    print("Starting simulation...\n")
 
 
 def init_xlb(args):
@@ -148,12 +179,12 @@ def init_xlb(args):
     )
     options = None
     if args.compute_backend == ComputeBackend.NEON:
-        neon_options = {"occ": args.occ, "device_list": args.gpu_devices}
+        neon_options = {"occ": args.occ_enum, "device_list": args.gpu_devices}
         options = neon_options
     return args.compute_backend, args.precision_policy, options
 
 
-def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices):
+def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices, collision_model):
     grid = grid_factory(grid_shape, backend_config=options)
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -170,7 +201,7 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
     stepper = IncompressibleNavierStokesStepper(
         grid=grid,
         boundary_conditions=boundary_conditions,
-        collision_type="BGK",
+        collision_type=collision_model,
         backend_config=options,
     )
 
@@ -231,51 +262,6 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
-
-def print_summary(args, elapsed_time, mlups):
-    """Print comprehensive simulation summary with parameters and performance results"""
-    total_lattice_points = args.cube_edge**3
-    total_lattice_updates = total_lattice_points * args.num_steps
-    lattice_points_per_second = total_lattice_updates / elapsed_time
-
-    print("\n\n\n" + "=" * 70)
-    print("                    SIMULATION SUMMARY")
-    print("=" * 70)
-
-    # Simulation Parameters
-    print("SIMULATION PARAMETERS:")
-    print("-" * 25)
-    print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
-    print(f"  Total Lattice Points:   {total_lattice_points:,}")
-    print(f"  Time Steps:             {args.num_steps:,}")
-    print(f"  Total Lattice Updates:  {total_lattice_updates:,}")
-    print(f"  Compute Backend:        {args.compute_backend.name}")
-    print(f"  Precision Policy:       {args.precision}")
-    print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
-    print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
-    print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
-
-    if args.compute_backend.name == "NEON":
-        print(f"  GPU Devices:            {args.gpu_devices}")
-        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
-        print(f"  OCC Strategy:           {occ_display}")
-
-    print()
-
-    # Performance Results
-    print("PERFORMANCE RESULTS:")
-    print("-" * 20)
-    print(f"  Time in main loop:      {elapsed_time:.3f} seconds")
-    print(f"  MLUPs:                  {mlups:.2f}")
-    print(f"  Time per LBM step:      {elapsed_time / args.num_steps * 1000:.3f} ms")
-
-    if args.compute_backend.name == "NEON" and len(args.gpu_devices) > 1:
-        mlups_per_gpu = mlups / len(args.gpu_devices)
-        print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f}")
-
-    print("=" * 70)
-
-
 def print_summary_with_stats(args, stats):
     """Print comprehensive simulation summary with statistics from multiple repetitions"""
     total_lattice_points = args.cube_edge**3
@@ -301,12 +287,13 @@ def print_summary_with_stats(args, stats):
     print(f"  Compute Backend:        {args.compute_backend.name}")
     print(f"  Precision Policy:       {args.precision}")
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Collision Model:        {args.collision_model}")
     print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
     print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
 
     if args.compute_backend.name == "NEON":
         print(f"  GPU Devices:            {args.gpu_devices}")
-        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
+        occ_display = args.occ_display
         print(f"  OCC Strategy:           {occ_display}")
 
     print()
@@ -350,7 +337,6 @@ def print_summary_with_stats(args, stats):
 
     print("=" * 70)
 
-
 def print_scalability_summary(args, stats_list):
     """Print comprehensive scalability summary with MLUPs statistics for different GPU counts"""
     total_lattice_points = args.cube_edge**3
@@ -371,9 +357,10 @@ def print_scalability_summary(args, stats_list):
     print(f"  Compute Backend:        {args.compute_backend.name}")
     print(f"  Precision Policy:       {args.precision}")
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Collision Model:        {args.collision_model}")
 
     if args.compute_backend.name == "NEON":
-        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
+        occ_display = args.occ_display
         print(f"  OCC Strategy:           {occ_display}")
         print(f"  Available GPU Devices:  {args.gpu_devices}")
 
@@ -436,7 +423,6 @@ def print_scalability_summary(args, stats_list):
 
     print("=" * 95)
 
-
 def report(args, stats):
     import neon
     import sys
@@ -450,6 +436,7 @@ def report(args, stats):
     report.add_member("velocity_set", args.velocity_set.__class__.__name__)
     report.add_member("compute_backend", args.compute_backend.name)
     report.add_member("precision_policy", args.precision)
+    report.add_member("collision_model", args.collision_model)
     report.add_member("grid_size", args.cube_edge)
     report.add_member("num_steps", args.num_steps)
     report.add_member("repetitions", args.repetitions)
@@ -469,16 +456,27 @@ def report(args, stats):
     report.add_member("elapsed_time", stats["mean_elapsed_time"])
     report.add_member("mlups", stats["mean_mlups"])
 
-    report.add_member("occ", (args.occ.to_string()))
+    report.add_member("occ", args.occ_display)
     report.add_member_vector("gpu_devices", args.gpu_devices)
     report.add_member("num_devices", len(args.gpu_devices))
     report.add_member("measure_scalability", args.measure_scalability)
 
-    report_name = "mlups_3d_" + f"size_{args.cube_edge}"
-    if args.measure_scalability:
-        report_name += f"_dev_{len(args.gpu_devices)}"
+    # Generate report name following the convention: script_name + parameters
+    report_name = "mlups_3d"
+    report_name += f"_velocity_set_{args.velocity_set.__class__.__name__}"
+    report_name += f"_compute_backend_{args.compute_backend.name}"
+    report_name += f"_precision_policy_{args.precision.replace('/', '_')}"
+    report_name += f"_collision_model_{args.collision_model}"
+    report_name += f"_grid_size_{args.cube_edge}"
+    report_name += f"_num_steps_{args.num_steps}"
+    
+    if args.compute_backend.name == "NEON":
+        report_name += f"_occ_{args.occ_display}"
+        report_name += f"_num_devices_{len(args.gpu_devices)}"
+    
     if args.repetitions > 1:
-        report_name += f"_rep_{args.repetitions}"
+        report_name += f"_repetitions_{args.repetitions}"
+    
     report.write(report_name, True)
 
 
@@ -500,6 +498,7 @@ def benchmark(args):
         export_final_velocity=args.export_final_velocity,
         repetitions=args.repetitions,
         num_devices=len(args.gpu_devices),
+        collision_model=args.collision_model,
     )
 
     for elapsed_time in elapsed_time_list:
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 06a938f8..6048dddf 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -10,16 +10,34 @@
 from xlb.grid import multires_grid_factory
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
+from xlb.mres_perf_optimization_type import MresPerfOptimizationType
 
 
 def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation with Multi-resolution Grid")
+    parser = argparse.ArgumentParser(
+        description="MLUPS for 3D Lattice Boltzmann Method Simulation with Multi-resolution Grid",
+        epilog="""
+Examples:
+  %(prog)s 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM
+  %(prog)s 200 500 neon fp64/fp64 3 FUSION_AT_FINEST --report
+  %(prog)s 50 2000 neon fp32/fp16 2 NAIVE_COLLIDE_STREAM --export_final_velocity
+
+Valid values:
+  compute_backend: neon
+  precision: fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16
+  mres_perf_opt: NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST
+  velocity_set: D3Q19, D3Q27
+  collision_model: BGK, KBC
+        """,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    
     # Positional arguments
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
-    parser.add_argument("num_levels", type=int, help="Number of levels for the multiresolution grid")
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid (e.g., 100)")
+    parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation (e.g., 1000)")
+    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (neon)")
+    parser.add_argument("precision", type=str, help="Precision for the simulation (fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16)")
+    parser.add_argument("num_levels", type=int, help="Number of levels for the multiresolution grid (e.g., 2)")
+    parser.add_argument("mres_perf_opt", type=MresPerfOptimizationType.from_string, help="Multi-resolution performance optimization strategy (NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST)")
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
@@ -32,8 +50,24 @@ def parse_arguments():
     parser.add_argument("--export_final_velocity", action="store_true",
                         help="Export the final velocity field to a vti file (default: disabled)")
 
-    args = parser.parse_args()
-    
+    try:
+        args = parser.parse_args()
+    except SystemExit:
+        # Print usage hints, then re-raise the original SystemExit
+        print("\n" + "="*60)
+        print("USAGE EXAMPLES:")
+        print("="*60)
+        print("python mlups_3d_multires.py 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM")
+        print("python mlups_3d_multires.py 200 500 neon fp64/fp64 3 FUSION_AT_FINEST --report")
+        print("\nVALID VALUES:")
+        print("  compute_backend: neon")
+        print("  precision: fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16")
+        print("  mres_perf_opt: NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST")
+        print("  velocity_set: D3Q19, D3Q27")
+        print("  collision_model: BGK, KBC")
+        print("="*60)
+        raise
+
     print_args(args)
 
     if args.compute_backend != "neon":
@@ -58,6 +92,7 @@ def print_args(args):
     print(f"Precision Policy:     {args.precision}")
     print(f"Velocity Set:         {args.velocity_set}")
     print(f"Collision Model:      {args.collision_model}")
+    print(f"Mres Perf Opt:        {args.mres_perf_opt}")
     print(f"Generate Report:      {'Yes' if args.report else 'No'}")
     print(f"Export Velocity:      {'Yes' if args.export_final_velocity else 'No'}")
 
@@ -211,8 +246,8 @@ def run(velocity_set,
         num_steps,
         num_levels,
         collision_model,
-        export_final_velocity
-        ):
+        export_final_velocity,
+        mres_perf_opt, ):
     # Create grid and setup boundary conditions
 
     # Convert indices to list of indices per level
@@ -243,7 +278,8 @@ def run(velocity_set,
     sim = xlb.helper.MultiresSimulationManager(omega=omega,
                                                grid=grid,
                                                boundary_conditions=boundary_conditions,
-                                               collision_type=collision_model)
+                                               collision_type=collision_model,
+                                               mres_perf_opt=mres_perf_opt, )
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
@@ -251,6 +287,10 @@ def run(velocity_set,
     print("start timing")
     wp.synchronize()
     start_time = time.time()
+
+    if num_levels == 1:
+        num_steps = num_steps // 2 
+    
     for i in range(num_steps):
         sim.step()
         # if i % 1000 == 0:
@@ -293,11 +333,11 @@ def generate_report(args, stats, mlups_stats):
     import sys
 
     report = neon.Report("LBM MLUPS Multiresolution LDC")
-    
+
     # Save the full command line
     command_line = " ".join(sys.argv)
     report.add_member("command_line", command_line)
-    
+
     report.add_member("velocity_set", args.velocity_set)
     report.add_member("compute_backend", args.compute_backend)
     report.add_member("precision_policy", args.precision)
@@ -306,11 +346,11 @@ def generate_report(args, stats, mlups_stats):
     report.add_member("num_steps", args.num_steps)
     report.add_member("num_levels", stats["num_levels"])
     report.add_member("finer_steps", mlups_stats["finer_steps"])
-    
+
     # Performance metrics
     report.add_member("elapsed_time", stats["time"])
     report.add_member("emlups", mlups_stats["EMLUPS"])
-    
+
     report_name = f"mlups_3d_multires_size_{args.cube_edge}_levels_{stats['num_levels']}"
     report.write(report_name, True)
     print("Report generated successfully.")
@@ -320,7 +360,9 @@ def main():
     args = parse_arguments()
     velocity_set = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    stats = run(velocity_set, grid_shape, args.num_steps, args.num_levels, args.collision_model, args.export_final_velocity)
+    stats = run(velocity_set, grid_shape, args.num_steps, args.num_levels, args.collision_model,
+                args.export_final_velocity,
+                mres_perf_opt = args.mres_perf_opt)
     mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats["time"], stats["num_levels"])
 
     print(f"Simulation completed in {stats['time']:.2f} seconds")

From 368e2f332d581031f2dcab6d566fb357e6401426 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 9 Oct 2025 10:15:49 -0400
Subject: [PATCH 188/208] Omega must vary across resolutions to maintain
 viscosity as a constant

---
 .../cuboid_flow_past_sphere_3d.py             |  4 +-
 .../grid_refinement/flow_past_sphere_3d.py    |  4 +-
 examples/performance/mlups_3d_multires.py     |  4 +-
 xlb/helper/simulation_manager.py              | 40 ++++++++++++++++---
 4 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index bfb16cbc..fec3559f 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -224,7 +224,7 @@ def bc_profile_warp(index: wp.vec3i):
 
 # Configure the simulation relaxation time
 visc = u_max * num_finest_voxels_across_part / Re
-omega = 1.0 / (3.0 * visc + 0.5)
+omega_finest = 1.0 / (3.0 * visc + 0.5)
 
 # Make initializer operator
 from xlb.helper.initializers import CustomMultiresInitializer
@@ -239,7 +239,7 @@ def bc_profile_warp(index: wp.vec3i):
 
 # Define a multi-resolution simulation manager
 sim = xlb.helper.MultiresSimulationManager(
-    omega=omega,
+    omega_finest=omega_finest,
     grid=grid,
     boundary_conditions=boundary_conditions,
     collision_type="KBC",
diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
index 399b2907..30738351 100644
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
@@ -145,11 +145,11 @@ def bc_profile_warp(index: wp.vec3i):
 
 # Configure the simulation relaxation time
 visc = 2.0 * u_max * sphere_radius / Re
-omega = 1.0 / (3.0 * visc + 0.5)
+omega_finest = 1.0 / (3.0 * visc + 0.5)
 
 # Define a multi-resolution simulation manager
 sim = xlb.helper.MultiresSimulationManager(
-    omega=omega,
+    omega_finest=omega_finest,
     grid=grid,
     boundary_conditions=boundary_conditions,
     collision_type="BGK",
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index f5df5fc3..cff15eb1 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -192,10 +192,10 @@ def run(velocity_set, grid_shape, num_steps):
     Re = 5000.0
     clength = grid_shape[0] - 1
     visc = prescribed_vel * clength / Re
-    omega = 1.0 / (3.0 * visc + 0.5)
+    omega_finest = 1.0 / (3.0 * visc + 0.5)
 
     # Define a multi-resolution simulation manager
-    sim = xlb.helper.MultiresSimulationManager(omega=omega, grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC")
+    sim = xlb.helper.MultiresSimulationManager(omega_finest=omega_finest, grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC")
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index a9c994d0..8abc56b6 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -12,7 +12,7 @@ class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
 
     def __init__(
         self,
-        omega,
+        omega_finest,
         grid,
         boundary_conditions=[],
         collision_type="BGK",
@@ -24,8 +24,8 @@ def __init__(
         super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
         self.initializer = initializer
-        self.omega = omega
         self.count_levels = grid.count_levels
+        self.omega_list = [self.compute_omega(omega_finest, level) for level in range(self.count_levels)]
         self.mres_perf_opt = mres_perf_opt
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
@@ -51,6 +51,27 @@ def __init__(
         # Construct the stepper skeleton
         self._construct_stepper_skeleton()
 
+    def compute_omega(self, omega_finest, level):
+        """
+        Compute the relaxation parameter omega at a given grid level based on the finest level omega.
+        We select a refinement ratio of 2, where a coarse cell at level L is uniformly divided into 2^d cells
+        (d being the dimension) to arrive at level L - 1; in other words, ∆x_{L-1} = ∆x_L/2.
+        For neighboring cells that interface two grid levels, a maximum jump in grid level of ∆L = 1 is
+        allowed. Due to acoustic scaling, which requires the speed of sound cs to remain constant across grid levels,
+        ∆t_L ∝ ∆x_L and hence ∆t_{L-1} = ∆t_L/2. In addition, the fluid viscosity nu must also remain constant on each
+        grid level, which leads to the following relationship for the relaxation parameter omega at grid level L based
+        on the finest-level value: omega_L = 2^(L+1) * omega_finest / ((2^L - 1) * omega_finest + 2).
+
+        Args:
+            omega_finest: Relaxation parameter at the finest grid level.
+            level: Current grid level (0-indexed, with 0 being the finest level).
+
+        Returns:
+            Relaxation parameter omega at the specified grid level.
+        """
+        omega0 = omega_finest
+        return 2 ** (level + 1) * omega0 / ((2**level - 1.0) * omega0 + 2.0)
+
     def export_macroscopic(self, fname_prefix):
         print(f"exporting macroscopic: #levels {self.count_levels}")
         self.macro(self.f_0, self.bc_mask, self.rho, self.u, streamId=0)
@@ -74,6 +95,10 @@ def _construct_stepper_skeleton(self):
         def recursion_reference(level, app):
             if level < 0:
                 return
+
+            # Look up the precomputed omega for the current level
+            omega = self.omega_list[level]
+
             print(f"RECURSION down to level {level}")
             print(f"RECURSION Level {level}, COLLIDE")
 
@@ -85,7 +110,7 @@ def recursion_reference(level, app):
                 f_1=self.f_1,
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
-                omega=self.omega,
+                omega=omega,
                 timestep=0,
             )
 
@@ -110,6 +135,9 @@ def recursion_fused_finest(level, app):
             if level < 0:
                 return
 
+            # Look up the precomputed omega for the current level
+            omega = self.omega_list[level]
+
             if level == 0:
                 print(f"RECURSION down to the finest level {level}")
                 print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
@@ -121,7 +149,7 @@ def recursion_fused_finest(level, app):
                     f_1=self.f_1,
                     bc_mask=self.bc_mask,
                     missing_mask=self.missing_mask,
-                    omega=self.omega,
+                    omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=True,
                 )
@@ -133,7 +161,7 @@ def recursion_fused_finest(level, app):
                     f_1=self.f_0,
                     bc_mask=self.bc_mask,
                     missing_mask=self.missing_mask,
-                    omega=self.omega,
+                    omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=False,
                 )
@@ -150,7 +178,7 @@ def recursion_fused_finest(level, app):
                 f_1=self.f_1,
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
-                omega=self.omega,
+                omega=omega,
                 timestep=0,
             )
             # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.

From a1ac799088970ac762d2c2ed3e9401816e04c912 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Thu, 2 Oct 2025 13:25:44 -0400
Subject: [PATCH 189/208] Added a utility class for unit conversion.

---
 .../cuboid_flow_past_sphere_3d.py             | 27 +-----
 xlb/utils/__init__.py                         |  1 +
 xlb/utils/mesher.py                           | 82 +++++++++++++----
 xlb/utils/utils.py                            | 89 +++++++++++++++++++
 4 files changed, 154 insertions(+), 45 deletions(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index bfb16cbc..ab3686ca 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -17,7 +17,7 @@
     HybridBC,
 )
 from xlb.operator.boundary_masker import MeshVoxelizationMethod
-from xlb.utils.mesher import make_cuboid_mesh
+from xlb.utils.mesher import make_cuboid_mesh, prepare_sparsity_pattern
 from xlb.operator.force import MultiresMomentumTransfer
 
 
@@ -82,31 +82,6 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     return level_data, mesh_vertices, tuple([int(a) for a in grid_shape_finest])
 
 
-def prepare_sparsity_pattern(level_data):
-    """
-    Prepare the sparsity pattern for the multiresolution grid based on the level data. "level_data" is expected to be formatted as in
-    the output of "make_cuboid_mesh".
-    """
-    num_levels = len(level_data)
-    sparsity_pattern = []
-    level_origins = []
-    sparsity_pattern = []
-    for lvl in range(num_levels):
-        # Get the level mask from the level data
-        level_mask = level_data[lvl][0]
-
-        # Ensure level_0 is contiguous int32
-        level_mask = np.ascontiguousarray(level_mask, dtype=np.int32)
-
-        # Append the padded level mask to the sparsity pattern
-        sparsity_pattern.append(level_mask)
-
-        # Get the origin for this level
-        level_origins.append(level_data[lvl][2])
-
-    return sparsity_pattern, level_origins
-
-
 # -------------------------- Simulation Setup --------------------------
 
 # The following parameters define the resolution of the voxelized grid
diff --git a/xlb/utils/__init__.py b/xlb/utils/__init__.py
index 6c31e90c..87aebef3 100644
--- a/xlb/utils/__init__.py
+++ b/xlb/utils/__init__.py
@@ -7,5 +7,6 @@
     voxelize_stl,
     axangle2mat,
     ToJAX,
+    UnitConvertor,
 )
 from .mesher import make_cuboid_mesh, MultiresIO
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 43304e38..20fcb092 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -1,9 +1,10 @@
 import numpy as np
 import trimesh
-from typing import Any
+from typing import Any, Optional
 
 import neon
 import warp as wp
+from xlb.utils.utils import UnitConvertor
 
 
 def adjust_bbox(cuboid_max, cuboid_min, voxel_size_up):
@@ -23,6 +24,30 @@ def adjust_bbox(cuboid_max, cuboid_min, voxel_size_up):
     return adjusted_min, adjusted_max
 
 
+def prepare_sparsity_pattern(level_data):
+    """
+    Prepare the sparsity pattern for the multiresolution grid based on the level data. "level_data" is expected to be formatted as in
+    the output of "make_cuboid_mesh".
+    """
+    num_levels = len(level_data)
+    level_origins = []
+    sparsity_pattern = []
+    for lvl in range(num_levels):
+        # Get the level mask from the level data
+        level_mask = level_data[lvl][0]
+
+        # Ensure the level mask is a contiguous int32 array
+        level_mask = np.ascontiguousarray(level_mask, dtype=np.int32)
+
+        # Append the padded level mask to the sparsity pattern
+        sparsity_pattern.append(level_mask)
+
+        # Get the origin for this level
+        level_origins.append(level_data[lvl][2])
+
+    return sparsity_pattern, level_origins
+
+
 def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
     """
     Create a strongly-balanced multi-level cuboid mesh with a sequence of bounding boxes.
@@ -125,7 +150,14 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
 
 
 class MultiresIO(object):
-    def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.0, 0.0, 0.0), store_precision=None):
+    def __init__(
+        self,
+        field_name_cardinality_dict,
+        levels_data,
+        unit_convertor: UnitConvertor = None,
+        offset: Optional[tuple] = (0.0, 0.0, 0.0),
+        store_precision=None,
+    ):
         """
         Initialize the MultiresIO object.
 
@@ -136,15 +168,18 @@ def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.
             Example: {'velocity_x': 1, 'velocity_y': 1, 'velocity': 3, 'density': 1}
         levels_data : list of tuples
             Each tuple contains (data, voxel_size, origin, level).
-        scale : float or tuple, optional
-            Scale factor for the coordinates.
+        unit_convertor : UnitConvertor, optional
+            An instance of the UnitConvertor class for unit conversions.
         offset : tuple, optional
             Offset to be applied to the coordinates.
         store_precision : str, optional
             The precision policy for storing data.
         """
+        # Set the unit convertor object
+        self.unit_convertor = unit_convertor
+
         # Process the multires geometry and extract coordinates and connectivity in the coordinate system of the finest level
-        coordinates, connectivity, level_id_field, total_cells = self.process_geometry(levels_data, scale)
+        coordinates, connectivity, level_id_field, total_cells = self.process_geometry(levels_data)
 
         # Ensure that coordinates and connectivity are not empty
         assert coordinates.size != 0, "Error: No valid data to process. Check the input levels_data."
@@ -152,8 +187,8 @@ def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.
         # Merge duplicate points
         coordinates, connectivity = self._merge_duplicates(coordinates, connectivity, levels_data)
 
-        # Apply scale and offset
-        coordinates = self._transform_coordinates(coordinates, scale, offset)
+        # Transform coordinates to physical units and apply offset if provided
+        coordinates = self._transform_coordinates(coordinates, offset)
 
         # Assign to self
         self.field_name_cardinality_dict = field_name_cardinality_dict
@@ -177,7 +212,7 @@ def __init__(self, field_name_cardinality_dict, levels_data, scale=1, offset=(0.
         # Construct the NEON container for exporting multi-resolution data
         self.container = self._construct_neon_container()
 
-    def process_geometry(self, levels_data, scale):
+    def process_geometry(self, levels_data):
         num_voxels_per_level = [np.sum(data) for data, _, _, _ in levels_data]
         num_points_per_level = [8 * nv for nv in num_voxels_per_level]
         point_id_offsets = np.cumsum([0] + num_points_per_level[:-1])
@@ -189,10 +224,10 @@ def process_geometry(self, levels_data, scale):
 
         for level_idx, (data, voxel_size, origin, level) in enumerate(levels_data):
             origin = origin * voxel_size
-            corners_list, conn_list = self._process_level(data, voxel_size, origin, level, point_id_offsets[level_idx])
+            corners_list, conn_list = self._process_level(data, voxel_size, origin, point_id_offsets[level_idx])
 
             if corners_list:
-                print(f"\tProcessing level {level}: Voxel size {voxel_size * scale}, Origin {origin}, Shape {data.shape}")
+                print(f"\tProcessing level {level}: Voxel size {voxel_size}, Origin {origin}, Shape {data.shape}")
                 all_corners.extend(corners_list)
                 all_connectivity.extend(conn_list)
                 num_cells = sum(c.shape[0] for c in conn_list)
@@ -208,13 +243,13 @@ def process_geometry(self, levels_data, scale):
 
         return coordinates, connectivity, level_id_field, total_cells
 
-    def _process_level(self, data, voxel_size, origin, level, point_id_offset):
+    def _process_level(self, data, voxel_size, origin, point_id_offset):
         """
         Given a voxel grid, returns all corners and connectivity in NumPy for this resolution level.
         """
         true_indices = np.argwhere(data)
         if true_indices.size == 0:
-            return [], [], level
+            return [], []
 
         max_voxels_per_chunk = 268_435_450
         chunks = np.array_split(true_indices, max(1, (len(true_indices) + max_voxels_per_chunk - 1) // max_voxels_per_chunk))
@@ -368,10 +403,11 @@ def _merge_duplicates(self, coordinates, connectivity, levels_data):
         connectivity = mapping[connectivity]
         return coordinates, connectivity
 
-    def _transform_coordinates(self, coordinates, scale, offset):
-        scale = np.array([scale] * 3 if isinstance(scale, (int, float)) else scale, dtype=np.float32)
+    def _transform_coordinates(self, coordinates, offset):
         offset = np.array(offset, dtype=np.float32)
-        return coordinates * scale + offset
+        if self.unit_convertor is not None:
+            coordinates = self.unit_convertor.length_to_physical(coordinates)
+        return coordinates + offset
 
     def _prepare_container_inputs(self):
         # load necessary modules
@@ -445,7 +481,7 @@ def get_fields_data(self, field_neon_dict):
 
         # Ensure that this operator is called on multires grids
         grid_mres = next(iter(field_neon_dict.values())).get_grid()
-        assert grid_mres.name == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases"
+        assert grid_mres.name == "mGrid", f"Operation {self.__class__.__name__} is only applicable to multi-resolution cases!"
 
         for field_name in field_neon_dict.keys():
             assert field_name in self.field_name_cardinality_dict.keys(), (
@@ -488,9 +524,19 @@ def get_fields_data(self, field_neon_dict):
             fields_data[field_name] = np.concatenate(fields_data[field_name])
             assert fields_data[field_name].size == self.total_cells, f"Error: Field {field_name} size mismatch!"
 
+            # Unit conversion if applicable
+            if self.unit_convertor is not None:
+                if "velocity" in field_name.lower():
+                    fields_data[field_name] = self.unit_convertor.velocity_to_physical(fields_data[field_name])
+                elif "density" in field_name.lower():
+                    fields_data[field_name] = self.unit_convertor.density_to_physical(fields_data[field_name])
+                elif "pressure" in field_name.lower():
+                    fields_data[field_name] = self.unit_convertor.pressure_to_physical(fields_data[field_name])
+                # Add more physical quantities as needed
+
         return fields_data
 
-    def to_hdf5(self, output_filename, field_neon_dict, compression="gzip", compression_opts=0, store_precision=None):
+    def to_hdf5(self, output_filename, field_neon_dict, compression="gzip", compression_opts=0):
         """
         Export the multi-resolution mesh data to an HDF5 file.
         Parameters
@@ -503,8 +549,6 @@ def to_hdf5(self, output_filename, field_neon_dict, compression="gzip", compress
             The compression method to use for the HDF5 file.
         compression_opts : int, optional
             The compression options to use for the HDF5 file.
-        store_precision : str, optional
-            The precision policy for storing data in the HDF5 file.
         """
         import time
 
diff --git a/xlb/utils/utils.py b/xlb/utils/utils.py
index a3aed2af..4f8aa377 100644
--- a/xlb/utils/utils.py
+++ b/xlb/utils/utils.py
@@ -408,3 +408,92 @@ def __call__(self, field):
 
         else:
             raise ValueError("Unsupported compute backend!")
+
+
+class UnitConvertor(object):
+    """Convert lengths, times, densities, velocities, viscosities and pressures between LBM and physical units."""
+
+    def __init__(
+        self,
+        velocity_lbm_unit: float,
+        velocity_physical_unit: float,
+        voxel_size_physical_unit: float,
+        density_physical_unit: float = 1.2041,
+        pressure_physical_unit: float = 1.101325e5,
+    ):
+        """
+        Initialize the UnitConvertor object.
+
+        Parameters
+        ----------
+        velocity_lbm_unit : float
+            The reference velocity in lattice Boltzmann units.
+        velocity_physical_unit : float
+            The reference velocity in physical units (e.g., m/s).
+        voxel_size_physical_unit : float
+            The size of a voxel in physical units (e.g., meters).
+        density_physical_unit : float, optional
+            The reference density in physical units (e.g., kg/m^3). Default is 1.2041 (density of air at room temperature).
+        pressure_physical_unit : float, optional
+            The reference pressure in physical units (e.g., Pascals). Default is 1.101325e5. NOTE(review): standard sea-level pressure is 1.01325e5 Pa; confirm the default.
+        """
+        self.voxel_size = voxel_size_physical_unit
+        self.velocity_lbm_unit = velocity_lbm_unit
+        self.velocity_phys_unit = velocity_physical_unit
+        self.reference_density = density_physical_unit  # reference density in physical units
+        self.reference_pressure = pressure_physical_unit  # reference pressure in physical units
+        self.referece_pressure = pressure_physical_unit  # misspelled legacy attribute, kept for backward compatibility
+
+    @property
+    def time_step_physical(self):
+        return self.voxel_size * self.velocity_lbm_unit / self.velocity_phys_unit  # physical duration of one lattice step
+
+    @property
+    def reference_length(self):
+        return self.voxel_size  # one lattice spacing, in physical units
+
+    @property
+    def reference_time(self):
+        return self.time_step_physical  # one lattice time step, in physical units
+
+    @property
+    def reference_velocity(self):
+        return self.reference_length / self.reference_time  # physical velocity corresponding to lattice velocity 1
+
+    def length_to_lbm(self, length_phys):
+        return length_phys / self.reference_length
+
+    def length_to_physical(self, length_lbm):
+        return length_lbm * self.reference_length
+
+    def time_to_lbm(self, time_phys):
+        return time_phys / self.reference_time
+
+    def time_to_physical(self, time_lbm):
+        return time_lbm * self.reference_time
+
+    def density_to_lbm(self, rho_phys):
+        return rho_phys / self.reference_density
+
+    def density_to_physical(self, rho_lbm):
+        return rho_lbm * self.reference_density
+
+    def velocity_to_lbm(self, velocity_phys):
+        return velocity_phys / self.reference_velocity
+
+    def velocity_to_physical(self, velocity_lbm):
+        return velocity_lbm * self.reference_velocity
+
+    def viscosity_to_lbm(self, viscosity_phys):
+        return viscosity_phys * (self.reference_time / (self.reference_length**2))
+
+    def viscosity_to_physical(self, viscosity_lbm):
+        return viscosity_lbm * (self.reference_length**2 / self.reference_time)
+
+    def pressure_to_lbm(self, pressure_phys):
+        # NOTE(review): returns the scaled deviation from reference_pressure without the 1/3 offset that pressure_to_physical removes - confirm intent
+        return (pressure_phys - self.reference_pressure) / self.reference_density / self.reference_velocity**2
+
+    def pressure_to_physical(self, pressure_lbm):
+        pressure_perturbation = pressure_lbm - 1.0 / 3.0  # remove the constant 1/3 offset before scaling
+        return self.reference_pressure + pressure_perturbation * self.reference_density * (self.reference_velocity**2)

From 3f1116fef86e71db823a5e56ba726b146ececd6f Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Sat, 11 Oct 2025 21:39:22 -0400
Subject: [PATCH 190/208] Added Ahmed benchmark example

---
 .gitignore                            |   3 +
 examples/cfd/data/ahmed.json          |  22 +
 examples/cfd/grid_refinement/ahmed.py | 566 ++++++++++++++++++++++++++
 3 files changed, 591 insertions(+)
 create mode 100644 examples/cfd/data/ahmed.json
 create mode 100644 examples/cfd/grid_refinement/ahmed.py

diff --git a/.gitignore b/.gitignore
index b65678c8..926cf2ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,6 @@ build/
 # Ignore h5 and xmf formats
 *.h5
 *.xmf
+
+# Ignore CSV files
+*.csv
\ No newline at end of file
diff --git a/examples/cfd/data/ahmed.json b/examples/cfd/data/ahmed.json
new file mode 100644
index 00000000..59ee02e7
--- /dev/null
+++ b/examples/cfd/data/ahmed.json
@@ -0,0 +1,22 @@
+{
+    "_comment": "Ahmed Car Model, slant - angle = 25 degree. Profiles on symmetry plane (y=0) covering entire field. Origin of coordinate system: x=0: end of the car, y=0: symmetry plane, z=0: ground plane S.Becker/H. Lienhart/C Stoots, Institute of Fluid Mechanics, University Erlangen-Nuremberg, Erlangen, Germany, Coordinates in meters need to convert to voxels, Velocity data in m/s",
+    "data": {
+    "-1.162" : { "x-velocity" : [26.995,29.825,29.182,28.488,27.703,26.988,26.456,26.163,26.190,26.523,27.083,28.033,29.131,30.429,31.747,33.036,34.268,35.354,36.312,37.083,37.770,38.484,39.033,39.447,39.839,40.086,40.268,40.380,40.451], "height" : [0.028,0.048,0.068,0.088,0.108,0.128,0.148,0.168,0.188,0.208,0.228,0.248,0.268,0.288,0.308,0.328,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.7388]},
+    "-1.062" : { "x-velocity" : [30.307,28.962,25.812,21.232,15.848,10.812,7.459,6.080,5.845,6.196,7.428,10.456,15.718,22.129,28.090,32.707,35.888,37.891,39.071,39.840,40.261,40.604,40.767,40.820,40.870,40.890,40.907,40.871,40.853], "height" : [0.028,0.048,0.068,0.088,0.108,0.128,0.148,0.168,0.188,0.208,0.228,0.248,0.268,0.288,0.308,0.328,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.962" : { "x-velocity" : [52.216,51.303,50.196,48.833,47.728,46.790,45.514,44.222,43.379,42.829,42.322,42.056,41.876,41.706,41.584], "height" : [0.363,0.368,0.378,0.388,0.398,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.862" : { "x-velocity" : [46.589,46.538,46.228,46.033,45.810,45.554,45.056,44.369,43.789,43.275,42.789,42.344,42.148,41.913,41.720], "height" : [0.363,0.368,0.378,0.388,0.398,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.562" : { "x-velocity" : [43.237,43.262,43.248,43.225,43.183,43.145,43.083,43.030,42.904,42.776,42.685,42.434,42.358,42.197,42.042], "height" : [0.363,0.368,0.378,0.388,0.398,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.362" : { "x-velocity" : [44.493,44.491,44.443,44.379,44.297,44.215,44.067,43.867,43.577,43.306,43.061,42.689,42.527,42.293,42.105], "height" : [0.363,0.368,0.378,0.388,0.398,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.212" : { "x-velocity" : [49.202,48.429,47.805,46.697,45.883,44.913,44.195,43.650,43.130,42.677,42.432,42.154,41.961], "height" : [0.368,0.378,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.162" : { "x-velocity" : [50.511,49.784,48.894,48.103,47.468,46.322,45.563,44.581,43.933,43.383,42.905,42.505,42.293,42.042,41.863], "height" : [0.348,0.358,0.368,0.378,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.112" : { "x-velocity" : [27.615,35.449,41.526,46.068,46.277,46.038,45.774,45.505,45.237,44.701,44.326,43.765,43.284,42.890,42.529,42.247,42.082,41.880,41.732], "height" : [0.318,0.323,0.328,0.338,0.348,0.358,0.368,0.378,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.062" : { "x-velocity" : [22.891,27.789,32.292,36.568,39.533,41.426,42.371,42.971,43.030,43.081,43.074,43.065,43.039,42.996,42.908,42.665,42.456,42.294,42.105,41.929,41.827,41.660,41.546], "height" : [0.298,0.303,0.308,0.313,0.318,0.323,0.328,0.338,0.348,0.358,0.368,0.378,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "-0.012" : { "x-velocity" : [23.304,26.317,29.429,32.341,34.923,37.106,38.673,39.841,40.447,40.780,40.973,41.085,41.193,41.282,41.359,41.442,41.522,41.699,41.737,41.749,41.724,41.714,41.642,41.574,41.518,41.431,41.366], "height" : [0.278,0.283,0.288,0.293,0.298,0.303,0.308,0.313,0.318,0.323,0.328,0.338,0.348,0.358,0.368,0.378,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},    
+    "0.038" : { "x-velocity" : [42.752,37.392,15.320,-4.501,-8.079,-8.892,-8.420,-7.027,-5.143,-2.903,-0.936,0.927,2.200,3.099,3.622,4.026,4.280,4.520,5.620,8.938,13.913,17.872,21.148,24.814,29.075,33.188,36.424,38.490,39.388,39.675,39.794,39.911,40.007,40.219,40.425,40.643,40.757,40.896,40.994,41.058,41.124,41.127,41.143,41.106,41.080], "height" : [0.028,0.038,0.048,0.058,0.068,0.078,0.088,0.098,0.108,0.118,0.128,0.138,0.148,0.158,0.168,0.178,0.188,0.198,0.208,0.218,0.228,0.238,0.248,0.258,0.268,0.278,0.288,0.298,0.308,0.318,0.328,0.338,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "0.088" : { "x-velocity" : [41.859,35.830,22.660,7.745,-5.808,-12.650,-14.748,-13.756,-10.659,-6.484,-2.121,1.303,3.672,5.441,7.066,9.157,11.613,14.620,17.662,20.639,23.565,26.437,29.484,32.441,35.024,36.938,37.938,38.377,38.595,38.728,38.856,38.976,39.133,39.438,39.749,39.975,40.129,40.344,40.499,40.649,40.783,40.853,40.927,40.945,40.960], "height" : [0.028,0.038,0.048,0.058,0.068,0.078,0.088,0.098,0.108,0.118,0.128,0.138,0.148,0.158,0.168,0.178,0.188,0.198,0.208,0.218,0.228,0.238,0.248,0.258,0.268,0.278,0.288,0.298,0.308,0.318,0.328,0.338,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "0.138" : { "x-velocity" : [36.223,32.501,24.752,14.281,2.799,-6.218,-10.908,-11.892,-9.708,-5.258,-0.140,4.331,7.882,10.995,13.961,16.699,19.477,22.063,24.651,27.081,29.524,31.950,34.043,35.594,36.506,37.053,37.386,37.614,37.832,38.032,38.214,38.397,38.575,38.940,39.298,39.533,39.749,40.028,40.206,40.404,40.580,40.691,40.803,40.858,40.921], "height" : [0.028,0.038,0.048,0.058,0.068,0.078,0.088,0.098,0.108,0.118,0.128,0.138,0.148,0.158,0.168,0.178,0.188,0.198,0.208,0.218,0.228,0.238,0.248,0.258,0.268,0.278,0.288,0.298,0.308,0.318,0.328,0.338,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "0.188" : { "x-velocity" : [29.417,27.755,23.967,18.261,11.662,5.405,0.676,-0.652,0.937,4.261,7.958,11.427,14.366,17.138,19.735,22.151,24.577,26.883,29.165,31.111,32.781,34.072,34.893,35.524,35.974,36.329,36.604,36.872,37.138,37.402,37.673,37.900,38.112,38.518,38.829,39.088,39.326,39.639,39.871,40.096,40.275,40.423,40.523,40.603,40.687], "height" : [0.028,0.038,0.048,0.058,0.068,0.078,0.088,0.098,0.108,0.118,0.128,0.138,0.148,0.158,0.168,0.178,0.188,0.198,0.208,0.218,0.228,0.238,0.248,0.258,0.268,0.278,0.288,0.298,0.308,0.318,0.328,0.338,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "0.238" : { "x-velocity" : [24.405,24.168,22.782,20.196,16.970,13.937,12.137,11.757,12.851,14.649,16.780,18.995,21.070,23.335,25.280,27.468,29.262,30.832,32.133,33.102,33.856,34.473,34.922,35.340,35.698,36.039,36.336,36.629,36.906,37.193,37.454,37.691,37.929,38.329,38.611,38.875,39.126,39.414,39.677,39.917,40.097,40.259,40.380,40.478,40.568], "height" : [0.028,0.038,0.048,0.058,0.068,0.078,0.088,0.098,0.108,0.118,0.128,0.138,0.148,0.158,0.168,0.178,0.188,0.198,0.208,0.218,0.228,0.238,0.248,0.258,0.268,0.278,0.288,0.298,0.308,0.318,0.328,0.338,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]},
+    "0.288" : { "x-velocity" : [21.489,22.225,22.127,21.456,20.404,19.743,19.541,19.909,21.002,22.381,24.018,25.670,27.421,28.998,30.371,31.523,32.406,33.111,33.670,34.155,34.532,34.893,35.240,35.567,35.875,36.158,36.437,36.708,36.974,37.230,37.473,37.709,37.932,38.266,38.515,38.773,39.008,39.270,39.562,39.782,39.962,40.148,40.266,40.369,40.475], "height" : [0.028,0.038,0.048,0.058,0.068,0.078,0.088,0.098,0.108,0.118,0.128,0.138,0.148,0.158,0.168,0.178,0.188,0.198,0.208,0.218,0.228,0.238,0.248,0.258,0.268,0.278,0.288,0.298,0.308,0.318,0.328,0.338,0.348,0.368,0.388,0.408,0.428,0.458,0.488,0.518,0.558,0.598,0.638,0.688,0.738]}
+    }
+}
\ No newline at end of file
diff --git a/examples/cfd/grid_refinement/ahmed.py b/examples/cfd/grid_refinement/ahmed.py
new file mode 100644
index 00000000..f6ef8b20
--- /dev/null
+++ b/examples/cfd/grid_refinement/ahmed.py
@@ -0,0 +1,566 @@
+import neon
+import warp as wp
+import numpy as np
+import time
+import os
+import matplotlib.pyplot as plt
+import trimesh
+import shutil
+
+import xlb
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.boundary_condition import (
+    DoNothingBC,
+    HybridBC,
+    RegularizedBC,
+)
+from xlb.operator.boundary_masker import MeshVoxelizationMethod
+from xlb.utils.mesher import prepare_sparsity_pattern, make_cuboid_mesh, MultiresIO
+from xlb.utils import UnitConvertor
+from xlb.operator.force import MultiresMomentumTransfer
+from xlb.helper.initializers import CustomMultiresInitializer
+
+wp.clear_kernel_cache()
+wp.config.quiet = True
+
+# User Configuration
+# =================
+# Physical and simulation parameters
+wind_speed_lbm = 0.05  # Lattice velocity
+wind_speed_mps = 38.0  # Physical inlet velocity in m/s (user input)
+flow_passes = 2  # Domain flow passes
+kinematic_viscosity = 1.508e-5  # Kinematic viscosity of air in m^2/s
+voxel_size = 0.005  # Finest voxel size in meters
+
+# STL filename
+stl_filename = "examples/cfd/stl-files/Ahmed_25_NoLegs.stl"
+script_name = "Ahmed"
+
+# I/O settings
+print_interval_percentage = 1  # Print every 1% of iterations
+file_output_crossover_percentage = 10  # Crossover at 10% of iterations
+num_file_outputs_pre_crossover = 20  # Outputs before crossover
+num_file_outputs_post_crossover = 5  # Outputs after crossover
+
+# Other setup parameters
+compute_backend = ComputeBackend.NEON
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+
+
+def generate_cuboid_mesh(stl_filename, voxel_size):
+    """
+    Alternative cuboid mesh generation based on Apolo's method with domain multipliers per level.
+
+    Parameters
+    ----------
+    stl_filename : str
+        Path of the STL file loaded with trimesh.
+    voxel_size : float
+        Voxel size forwarded to make_cuboid_mesh.
+
+    Returns
+    -------
+    tuple
+        (level_data, mesh_vertices, grid_shape_finest, stl_shift, x0): the make_cuboid_mesh output, the translated
+        mesh vertices, the integer grid shape, the translation applied to the STL, and the maximum x of the
+        untranslated mesh.
+    """
+    # Domain multipliers for each refinement level
+    domain_multiplier = [
+        [3.0, 4.0, 2.5, 2.5, 0.0, 4.0],     # -x, x, -y, y, -z, z
+        [1.2, 1.25, 1.75, 1.75, 0.0, 1.5],
+        [0.8, 1.0, 1.25, 1.25, 0.0, 1.2],
+        [0.5, 0.65, 0.6, 0.60, 0.0, 0.6],
+        [0.25, 0.25, 0.25, 0.25, 0.0, 0.25],
+    ]
+    temp_stl = "temp.stl"  # scratch file handed to make_cuboid_mesh
+
+    mesh = trimesh.load_mesh(stl_filename, process=False)
+    if mesh.is_empty:
+        raise ValueError("Loaded mesh is empty or invalid.")
+
+    # Bounds of the untranslated mesh
+    min_bound = mesh.vertices.min(axis=0)
+    max_bound = mesh.vertices.max(axis=0)
+    part_size = max_bound - min_bound
+    x0 = max_bound[0]  # End of car for Ahmed
+
+    # Shift per axis: first-level multiplier of the negative side (columns 0, 2, 4) times the part size, minus the lower bound
+    stl_shift = np.array([domain_multiplier[0][2 * axis] * part_size[axis] - min_bound[axis] for axis in range(3)], dtype=float)
+
+    mesh.apply_translation(stl_shift)
+    _ = mesh.vertex_normals  # accessed for its side effect only; presumably makes trimesh compute the normals - confirm
+    mesh_vertices = np.asarray(mesh.vertices)
+
+    # Export the shifted mesh and build the levels; the scratch file is removed even if either step raises
+    try:
+        mesh.export(temp_stl)
+        level_data = make_cuboid_mesh(voxel_size, domain_multiplier, temp_stl)
+    finally:
+        if os.path.exists(temp_stl):
+            os.remove(temp_stl)
+
+    # Shape of the last level scaled by 2 ** (num_levels - 1)
+    num_levels = len(level_data)
+    grid_shape_finest = tuple(int(i * 2 ** (num_levels - 1)) for i in level_data[-1][0].shape)
+    print(f"Full shape based on finest voxel size is {grid_shape_finest}")
+
+    return level_data, mesh_vertices, grid_shape_finest, stl_shift, x0
+
+
+# Boundary Conditions Setup
+# =========================
+def setup_boundary_conditions(grid, level_data, body_vertices, wind_speed_mps):
+    """
+    Build the boundary conditions: velocity inlet (left), do-nothing outlet (right), four hybrid side faces and the body.
+
+    Uses the module-level unit_convertor to convert the inlet speed and the body vertices to lattice units.
+    Returns [bc_top, bc_bottom, bc_front, bc_back, bc_inlet, bc_outlet, bc_body].
+    """
+    # Convert wind speed to lattice units
+    inlet_speed = unit_convertor.velocity_to_lbm(wind_speed_mps)
+
+    # Face indices across all levels; edges are removed only on the inlet (left) and outlet (right) faces
+    remove_edges_by_side = {"left": True, "right": True, "top": False, "bottom": False, "front": False, "back": False}
+    face = {
+        side: grid.boundary_indices_across_levels(level_data, box_side=side, remove_edges=flag)
+        for side, flag in remove_edges_by_side.items()
+    }
+
+    hybrid_method = "nonequilibrium_regularized"
+    bc_inlet = RegularizedBC("velocity", prescribed_value=(inlet_speed, 0.0, 0.0), indices=face["left"])
+    bc_outlet = DoNothingBC(indices=face["right"])
+    bc_top, bc_bottom, bc_front, bc_back = (HybridBC(bc_method=hybrid_method, indices=face[side]) for side in ("top", "bottom", "front", "back"))
+    bc_body = HybridBC(
+        bc_method=hybrid_method,
+        mesh_vertices=unit_convertor.length_to_lbm(body_vertices),
+        voxelization_method=MeshVoxelizationMethod("AABB_CLOSE", close_voxels=4),
+        use_mesh_distance=True,
+    )
+
+    return [bc_top, bc_bottom, bc_front, bc_back, bc_inlet, bc_outlet, bc_body]
+
+
+# Simulation Initialization
+# =========================
+def initialize_simulation(
+    grid, boundary_conditions, omega_finest, initializer, collision_type="KBC", mres_perf_opt=xlb.MresPerfOptimizationType.FUSION_AT_FINEST
+):
+    """
+    Create and return the xlb.helper.MultiresSimulationManager for this run; every argument is forwarded unchanged.
+    """
+    manager_kwargs = dict(
+        omega_finest=omega_finest,
+        grid=grid,
+        boundary_conditions=boundary_conditions,
+        collision_type=collision_type,
+        initializer=initializer,
+        mres_perf_opt=mres_perf_opt,
+    )
+    return xlb.helper.MultiresSimulationManager(**manager_kwargs)
+
+
+# Utility Functions
+# =================
+def print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area):
+    """
+    Compute drag (Cd) and lift (Cl) coefficients, append [Cd, Cl] to the module-level drag_values list and
+    return (Cd, Cl, drag force). Raises ValueError if either coefficient is NaN.
+    """
+    force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
+    drag = force[0]
+    dynamic_scale = wind_speed_lbm**2 * reference_area
+    cd, cl = 2.0 * drag / dynamic_scale, 2.0 * force[2] / dynamic_scale
+    if np.isnan(cd) or np.isnan(cl):
+        print(f"NaN detected in coefficients at step {step}")
+        raise ValueError(f"NaN detected in coefficients at step {step}: Cd={cd}, Cl={cl}")
+    drag_values.append([cd, cl])
+    return cd, cl, drag
+
+
+def plot_drag_lift(drag_values, output_dir, print_interval, script_name, percentile_range=(15, 85), use_log_scale=False):
+    """
+    Plot the Cd and Cl histories against the simulation step and save drag_lift_plot.png in output_dir.
+    The y-limits span the requested percentiles of both series plus 10% padding.
+    """
+    history = np.array(drag_values)
+    steps = np.arange(0, len(drag_values) * print_interval, print_interval)
+    cd_values, cl_values = history[:, 0], history[:, 1]
+    lower = min(np.percentile(series, percentile_range[0]) for series in (cd_values, cl_values))
+    upper = max(np.percentile(series, percentile_range[1]) for series in (cd_values, cl_values))
+    padding = (upper - lower) * 0.1
+    lower, upper = lower - padding, upper + padding
+    if use_log_scale:
+        lower = max(lower, 1e-6)  # keep the lower limit positive for the log axis
+    plt.figure(figsize=(10, 6))
+    for series, label, color in ((cd_values, "Drag Coefficient (Cd)", "blue"), (cl_values, "Lift Coefficient (Cl)", "red")):
+        plt.plot(steps, series, label=label, color=color)
+    plt.xlabel("Simulation Step")
+    plt.ylabel("Coefficient")
+    plt.title(f"{script_name}: Drag and Lift Coefficients Over Time")
+    plt.legend()
+    plt.grid(True)
+    plt.ylim(lower, upper)
+    if use_log_scale:
+        plt.yscale("log")
+    plt.savefig(os.path.join(output_dir, "drag_lift_plot.png"))
+    plt.close()
+
+
+def compute_voxel_statistics(sim, bc_mask_exporter, sparsity_pattern, boundary_conditions, unit_convertor):
+    """
+    Derive voxel counts, lattice updates per step and the body reference area from the exported bc_mask field.
+
+    Reads the module-level num_levels.
+
+    Returns
+    -------
+    dict
+        Keys "active_voxels" and "solid_voxels" (one entry per level), "total_voxels",
+        "total_lattice_updates_per_step", "reference_area" (voxel count) and "reference_area_physical"
+        (reference_area times unit_convertor.reference_length squared).
+    """
+    bc_mask_data = bc_mask_exporter.get_fields_data({"bc_mask": sim.bc_mask})["bc_mask_0"]
+    level_id_field = bc_mask_exporter.level_id_field
+    levels = range(num_levels)
+
+    # Solid voxels per level (assuming 255 is the solid marker)
+    solid_voxels = [np.sum(bc_mask_data[level_id_field == lvl] == 255) for lvl in levels]
+
+    # Active voxels per level: non-zero sparsity entries minus solids, never below zero
+    nonzero_counts = [np.count_nonzero(mask) for mask in sparsity_pattern]
+    active_voxels = [max(0, nonzero_counts[lvl] - solid_voxels[lvl]) for lvl in levels]
+
+    # Each level lvl contributes its active voxels times 2 ** (num_levels - 1 - lvl)
+    total_lattice_updates_per_step = sum(active_voxels[lvl] * (2 ** (num_levels - 1 - lvl)) for lvl in levels)
+
+    # Reference area (projected on YZ plane at finest level): distinct (j, k) pairs among body voxels of level 0
+    cells_level0 = np.argwhere(sparsity_pattern[0])
+    bc_body_id = boundary_conditions[-1].id  # Assuming last BC is bc_body
+    body_cells = cells_level0[bc_mask_data[level_id_field == 0] == bc_body_id]
+    reference_area = np.unique(body_cells[:, 1:3], axis=0).shape[0]
+
+    return {
+        "active_voxels": active_voxels,
+        "solid_voxels": solid_voxels,
+        "total_voxels": sum(active_voxels),
+        "total_lattice_updates_per_step": total_lattice_updates_per_step,
+        "reference_area": reference_area,
+        "reference_area_physical": reference_area * unit_convertor.reference_length**2,
+    }
+
+
+def plot_data(x0, output_dir, delta_x_coarse, sim, IOexporter, prefix="Ahmed"):
+    """
+    Compare simulated x-velocity profiles with the Ahmed body reference data and save one plot per station.
+
+    Reference data: Ahmed Car Model, slant angle = 25 degrees, profiles on the symmetry plane (y=0).
+    Origin of coordinate system: x=0: end of the car, y=0: symmetry plane, z=0: ground plane.
+    S. Becker / H. Lienhart / C. Stoots, Institute of Fluid Mechanics, University Erlangen-Nuremberg,
+    Erlangen, Germany. Coordinates are in meters, velocity data in m/s.
+
+    Parameters
+    ----------
+    x0 : float
+        x-coordinate added to each station offset read from the reference file.
+    output_dir : str
+        Directory receiving the CSV and PNG files.
+    delta_x_coarse : float
+        Passed to IOexporter.to_line as the sampling radius.
+    sim : object
+        Simulation object providing macro, f_0, bc_mask, rho and u.
+    IOexporter : object
+        Exporter providing to_line.
+    prefix : str, optional
+        File-name prefix of the generated files.
+    """
+
+    def _load_sim_line(csv_path):
+        """Read a CSV exported by IOexporter.to_line without pandas and return (z, Ux) as float arrays."""
+        # Read with header as column names
+        data = np.genfromtxt(csv_path, delimiter=",", names=True, autostrip=True, dtype=None, encoding="utf-8")
+        if data.size == 0:
+            raise ValueError(f"No data in {csv_path}")
+        return np.asarray(data["z"], dtype=float), np.asarray(data["value"], dtype=float)
+
+    # Load reference data
+    import json
+
+    ref_data_path = "examples/cfd/data/ahmed.json"
+    with open(ref_data_path, "r") as file:
+        data = json.load(file)
+
+    for x_str, profile in data["data"].items():
+        # Extract reference horizontal velocity in m/s and its corresponding height in m
+        refX = np.array(profile["x-velocity"])
+        refY = np.array(profile["height"])
+
+        # From reference x0 (rear of body) find x1 for plot
+        x_pos = float(x_str)
+        x1 = x0 + x_pos
+
+        print(f" x1 is {x1}")
+        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
+        filename = os.path.join(output_dir, f"{prefix}_{x_str}")
+        wp.synchronize()
+        IOexporter.to_line(
+            filename,
+            {"velocity": sim.u},
+            start_point=(x1, 0, 0),
+            end_point=(x1, 0, 0.8),
+            resolution=250,
+            component=0,
+            radius=delta_x_coarse,  # needed with model units
+        )
+        # read the CSV written by the exporter
+        csv_path = filename + "_velocity_0.csv"
+        print(f"CSV path is {csv_path}")
+
+        # Best effort: a station whose CSV cannot be read is reported and skipped
+        try:
+            sim_z, sim_ux = _load_sim_line(csv_path)
+        except Exception as e:
+            print(f"Failed to read {csv_path}: {e}")
+            continue
+
+        # plot reference vs simulation
+        plt.figure(figsize=(4.5, 6))
+        plt.plot(refX, refY, "o", mfc="none", label="Experimental")
+        plt.plot(sim_ux, sim_z, "-", lw=2, label="Simulation")
+        # Extend the x-range 10% beyond the reference extremes; the factor depends on sign so negative (reverse-flow) velocities are not clipped
+        ref_min, ref_max = np.min(refX), np.max(refX)
+        plt.xlim(ref_min * (0.9 if ref_min >= 0 else 1.1), ref_max * (1.1 if ref_max >= 0 else 0.9))
+        plt.ylim(np.min(refY), np.max(refY))
+        plt.xlabel("Ux [m/s]")
+        plt.ylabel("z [m]")
+        plt.title(f"Velocity Plot at {x_pos:+.3f}")
+        plt.grid(True, alpha=0.3)
+        plt.legend()
+        plt.tight_layout()
+        plt.savefig(filename + ".png", dpi=150)
+        plt.close()
+
+
+# Main Script
+# ===========
+# Initialize XLB
+
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+# Generate mesh
+level_data, body_vertices, grid_shape_zip, stl_shift, x0 = generate_cuboid_mesh(stl_filename, voxel_size)
+
+# Prepare the sparsity pattern and origins from the level data
+sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
+
+# Define a unit convertor
+unit_convertor = UnitConvertor(
+    velocity_lbm_unit=wind_speed_lbm,
+    velocity_physical_unit=wind_speed_mps,
+    voxel_size_physical_unit=voxel_size,
+)
+
+# Calculate lattice parameters
+num_levels = len(level_data)
+delta_x_coarse = voxel_size * 2 ** (num_levels - 1)
+nu_lattice = unit_convertor.viscosity_to_lbm(kinematic_viscosity)
+omega_finest = 1.0 / (3.0 * nu_lattice + 0.5)
+
+# Create output directory
+current_dir = os.path.join(os.path.dirname(__file__))
+output_dir = os.path.join(current_dir, script_name)
+if os.path.exists(output_dir):
+    shutil.rmtree(output_dir)
+os.makedirs(output_dir)
+
+# Define exporter objects
+field_name_cardinality_dict = {"velocity": 3, "density": 1}
+h5exporter = MultiresIO(
+    field_name_cardinality_dict,
+    level_data,
+    offset=-stl_shift,
+    unit_convertor=unit_convertor,
+)
+bc_mask_exporter = MultiresIO(
+    {"bc_mask": 1},
+    level_data,
+    offset=-stl_shift,
+    unit_convertor=unit_convertor,
+)
+
+# Create grid
+grid = multires_grid_factory(
+    grid_shape_zip,
+    velocity_set=velocity_set,
+    sparsity_pattern_list=sparsity_pattern,
+    sparsity_pattern_origins=[neon.Index_3d(*box_origin) for box_origin in level_origins],
+)
+
+# Calculate num_steps
+coarsest_level = grid.count_levels - 1
+grid_shape_x_coarsest = grid.level_to_shape(coarsest_level)[0]
+num_steps = int(flow_passes * (grid_shape_x_coarsest / wind_speed_lbm))
+
+# Calculate print and file output intervals
+print_interval = max(1, int(num_steps * (print_interval_percentage / 100.0)))
+crossover_step = int(num_steps * (file_output_crossover_percentage / 100.0))
+file_output_interval_pre_crossover = (
+    max(1, int(crossover_step / num_file_outputs_pre_crossover)) if num_file_outputs_pre_crossover > 0 else num_steps + 1
+)
+file_output_interval_post_crossover = (
+    max(1, int((num_steps - crossover_step) / num_file_outputs_post_crossover)) if num_file_outputs_post_crossover > 0 else num_steps + 1
+)
+
+# Setup boundary conditions
+boundary_conditions = setup_boundary_conditions(grid, level_data, body_vertices, wind_speed_mps)
+
+# Create initializer
+wind_speed_lbm = unit_convertor.velocity_to_lbm(wind_speed_mps)
+initializer = CustomMultiresInitializer(
+    bc_id=boundary_conditions[-2].id,  # bc_outlet
+    constant_velocity_vector=(wind_speed_lbm, 0.0, 0.0),
+    velocity_set=velocity_set,
+    precision_policy=precision_policy,
+    compute_backend=compute_backend,
+)
+
+# Initialize simulation
+sim = initialize_simulation(grid, boundary_conditions, omega_finest, initializer)
+
+# Compute voxel statistics and reference area
+stats = compute_voxel_statistics(sim, bc_mask_exporter, sparsity_pattern, boundary_conditions, unit_convertor)
+active_voxels = stats["active_voxels"]
+solid_voxels = stats["solid_voxels"]
+total_voxels = stats["total_voxels"]
+total_lattice_updates_per_step = stats["total_lattice_updates_per_step"]
+reference_area = stats["reference_area"]
+reference_area_physical = stats["reference_area_physical"]
+
+# Save initial bc_mask
+filename = os.path.join(output_dir, f"{script_name}_initial_bc_mask")
+try:
+    bc_mask_exporter.to_hdf5(filename, {"bc_mask": sim.bc_mask}, compression="gzip", compression_opts=0)
+    xmf_filename = f"{filename}.xmf"
+    hdf5_basename = f"{script_name}_initial_bc_mask.h5"
+except Exception as e:
+    print(f"Error during initial bc_mask output: {e}")
+wp.synchronize()
+
+
+# Setup momentum transfer
+momentum_transfer = MultiresMomentumTransfer(
+    boundary_conditions[-1],
+    mres_perf_opt=xlb.MresPerfOptimizationType.FUSION_AT_FINEST,
+    compute_backend=compute_backend,
+)
+
+# Print simulation info
+print("\n" + "=" * 50 + "\n")
+print(f"Number of flow passes: {flow_passes}")
+print(f"Calculated iterations: {num_steps:,}")
+print(f"Finest voxel size: {voxel_size} meters")
+print(f"Coarsest voxel size: {delta_x_coarse} meters")
+print(f"Total voxels: {sum(np.count_nonzero(mask) for mask in sparsity_pattern):,}")
+print(f"Total active voxels: {total_voxels:,}")
+print(f"Active voxels per level: {active_voxels}")
+print(f"Solid voxels per level: {solid_voxels}")
+print(f"Total lattice updates per global step: {total_lattice_updates_per_step:,}")
+print(f"Number of refinement levels: {num_levels}")
+print(f"Physical inlet velocity: {wind_speed_mps:.4f} m/s")
+print(f"Lattice velocity (ulb): {wind_speed_lbm}")
+print(f"Computed reference area (bc_mask): {reference_area} lattice units")
+print(f"Physical reference area (bc_mask): {reference_area_physical:.6f} m^2")
+print("\n" + "=" * 50 + "\n")
+
+# -------------------------- Simulation Loop --------------------------
+wp.synchronize()
+start_time = time.time()
+compute_time = 0.0
+steps_since_last_print = 0
+drag_values = []
+
+for step in range(num_steps):
+    step_start = time.time()
+    sim.step()
+    wp.synchronize()
+    compute_time += time.time() - step_start
+    steps_since_last_print += 1
+    if step % print_interval == 0 or step == num_steps - 1:
+        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
+        wp.synchronize()
+        cd, cl, drag = print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area)
+        filename = os.path.join(output_dir, f"{script_name}_{step:04d}")
+        h5exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=0)
+        h5exporter.to_slice_image(
+            filename,
+            {"velocity": sim.u},
+            plane_point=(1, 0, 0),
+            plane_normal=(0, 1, 0),
+            grid_res=2000,
+            bounds=(0.25, 0.75, 0, 0.5),
+            show_axes=False,
+            show_colorbar=False,
+            slice_thickness=delta_x_coarse,  # needed when using model units
+        )
+        end_time = time.time()
+        elapsed = end_time - start_time
+        total_lattice_updates = total_lattice_updates_per_step * steps_since_last_print
+        MLUPS = total_lattice_updates / compute_time / 1e6 if compute_time > 0 else 0.0
+        current_flow_passes = step * wind_speed_lbm / grid_shape_x_coarsest
+        remaining_steps = num_steps - step - 1
+        time_remaining = 0.0 if MLUPS == 0 else (total_lattice_updates_per_step * remaining_steps) / (MLUPS * 1e6)
+        hours, rem = divmod(time_remaining, 3600)
+        minutes, seconds = divmod(rem, 60)
+        time_remaining_str = f"{int(hours):02d}h {int(minutes):02d}m {int(seconds):02d}s"
+        percent_complete = (step + 1) / num_steps * 100
+        print(f"Completed step {step}/{num_steps} ({percent_complete:.2f}% complete)")
+        print(f"  Flow Passes: {current_flow_passes:.2f}")
+        print(f"  Time elapsed: {elapsed:.1f}s, Compute time: {compute_time:.1f}s, ETA: {time_remaining_str}")
+        print(f"  MLUPS: {MLUPS:.1f}")
+        print(f"  Cd={cd:.3f}, Cl={cl:.3f}, Drag Force (lattice units)={drag:.3f}")
+        start_time = time.time()
+        compute_time = 0.0
+        steps_since_last_print = 0
+    file_output_interval = file_output_interval_pre_crossover if step < crossover_step else file_output_interval_post_crossover
+    if step % file_output_interval == 0 or step == num_steps - 1:
+        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
+        filename = os.path.join(output_dir, f"{script_name}_{step:04d}")
+        try:
+            h5exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=0)
+            xmf_filename = f"{filename}.xmf"
+            hdf5_basename = f"{script_name}_{step:04d}.h5"
+        except Exception as e:
+            print(f"Error during file output at step {step}: {e}")
+        wp.synchronize()
+    if step == num_steps - 1:
+        plot_data(x0, output_dir, delta_x_coarse, sim, h5exporter, prefix="Ahmed")
+
+# Save drag and lift data to CSV
+if len(drag_values) > 0:
+    with open(os.path.join(output_dir, "drag_lift.csv"), "w") as fd:
+        fd.write("Step,Cd,Cl\n")
+        for i, (cd, cl) in enumerate(drag_values):
+            fd.write(f"{i * print_interval},{cd},{cl}\n")
+    plot_drag_lift(drag_values, output_dir, print_interval, script_name)
+
+# Calculate and print average Cd and Cl for the last 50%
+drag_values_array = np.array(drag_values)
+if len(drag_values) > 0:
+    start_index = len(drag_values) // 2
+    last_half = drag_values_array[start_index:, :]
+    avg_cd = np.mean(last_half[:, 0])
+    avg_cl = np.mean(last_half[:, 1])
+    print(f"Average Drag Coefficient (Cd) for last 50%: {avg_cd:.6f}")
+    print(f"Average Lift Coefficient (Cl) for last 50%: {avg_cl:.6f}")
+    print(f"Experimental Drag Coefficient (Cd): {0.3088}")
+    print(f"Error Drag Coefficient (Cd): {((avg_cd - 0.3088) / 0.3088) * 100:.2f}%")
+
+else:
+    print("No drag or lift data collected.")

From 3fcf15125cb1dc1df4d261ac6fcfe67678309b86 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 14 Nov 2025 09:51:13 +0100
Subject: [PATCH 191/208] refactoring(mres-stepper): new 'add_to_app'
 functionality

Simplifies the `add_to_app` method in the multiresolution stepper.
It now leverages keyword arguments and introspection for more flexible and maintainable operator calls.
This change enhances code readability and reduces the risk of errors when adding new operators.
---
 xlb/helper/simulation_manager.py             | 44 +++++-----
 xlb/operator/stepper/nse_multires_stepper.py | 87 ++++++++++++++++----
 2 files changed, 95 insertions(+), 36 deletions(-)

diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 8abc56b6..19a2f45d 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -105,7 +105,7 @@ def recursion_reference(level, app):
             self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
-                mres_level=level,
+                level=level,
                 f_0=self.f_0,
                 f_1=self.f_1,
                 bc_mask=self.bc_mask,
@@ -122,7 +122,7 @@ def recursion_reference(level, app):
             self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
-                mres_level=level,
+                level=level,
                 f_0=self.f_1,
                 f_1=self.f_0,
                 bc_mask=self.bc_mask,
@@ -144,11 +144,11 @@ def recursion_fused_finest(level, app):
                 self.add_to_app(
                     app=app,
                     op_name="finest_fused_pull",
-                    mres_level=level,
-                    f_0=self.f_0,
-                    f_1=self.f_1,
-                    bc_mask=self.bc_mask,
-                    missing_mask=self.missing_mask,
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
                     omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=True,
@@ -156,11 +156,11 @@ def recursion_fused_finest(level, app):
                 self.add_to_app(
                     app=app,
                     op_name="finest_fused_pull",
-                    mres_level=level,
-                    f_0=self.f_1,
-                    f_1=self.f_0,
-                    bc_mask=self.bc_mask,
-                    missing_mask=self.missing_mask,
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
                     omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=False,
@@ -173,11 +173,11 @@ def recursion_fused_finest(level, app):
             self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
-                mres_level=level,
-                f_0=self.f_0,
-                f_1=self.f_1,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
                 omega=omega,
                 timestep=0,
             )
@@ -196,11 +196,11 @@ def recursion_fused_finest(level, app):
             self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
-                mres_level=level,
-                f_0=self.f_1,
-                f_1=self.f_0,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
                 omega=self.coalescence_factor,
                 timestep=0,
             )
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 63ff560d..d18257f4 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -896,22 +896,81 @@ def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, mis
 
     def add_to_app(
         self,
-        app,
-        op_name,
-        mres_level,
-        f_0,
-        f_1,
-        bc_mask,
-        missing_mask,
-        omega,
-        timestep,
-        is_f1_the_explosion_src_field: bool = None,
+            **kwargs
     ):
+        import inspect
+
+        def validate_kwargs_forward(func, kwargs):
+            """
+            Check whether `func(**kwargs)` would be valid,
+            and return *all* the issues instead of raising on the first one.
+
+            Returns a dict; empty dict means "everything is OK".
+            """
+            sig = inspect.signature(func)
+            params = sig.parameters
+
+            errors = {}
+
+            # --- 1. Positional-only required params (cannot be given via kwargs) ---
+            pos_only_required = [
+                name for name, p in params.items()
+                if p.kind == inspect.Parameter.POSITIONAL_ONLY
+                   and p.default is inspect._empty
+            ]
+            if pos_only_required:
+                errors["positional_only_required"] = pos_only_required
+
+            # --- 2. Unexpected kwargs (if no **kwargs in target) ---
+            has_var_kw = any(
+                p.kind == inspect.Parameter.VAR_KEYWORD
+                for p in params.values()
+            )
+            if not has_var_kw:
+                allowed_kw = {
+                    name for name, p in params.items()
+                    if p.kind in (
+                        inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                        inspect.Parameter.KEYWORD_ONLY,
+                    )
+                }
+                unexpected = sorted(set(kwargs) - allowed_kw)
+                if unexpected:
+                    errors["unexpected_kwargs"] = unexpected
+
+            # --- 3. Missing required keyword-passable params ---
+            missing_required = [
+                name for name, p in params.items()
+                if p.kind in (
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    inspect.Parameter.KEYWORD_ONLY,
+                )
+                   and p.default is inspect._empty  # no default
+                   and name not in kwargs  # not provided
+            ]
+            if missing_required:
+                errors["missing_required"] = missing_required
+
+            return errors
+
+        container_generator = None
+        try:
+            op_name = kwargs.pop("op_name")
+            app = kwargs.pop("app")
+        except:
+            raise ValueError("op_name and app must be provided as keyword arguments")
+
+        try:
+            container_generator = self.neon_container[op_name]
+        except KeyError:
+            raise ValueError(f"Operator {op_name} not found in neon container. Available operators: {list(self.neon_container.keys())}")
+
+        errors = validate_kwargs_forward(container_generator, kwargs)
+        if errors:
+            raise ValueError(f"Cannot forward kwargs to target: {errors}")
+
         nvtx.push_range(f"New Container {op_name}", color="yellow")
-        if is_f1_the_explosion_src_field is None:
-            app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
-        else:
-            app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep, is_f1_the_explosion_src_field))
+        app.append(container_generator(**kwargs))
         nvtx.pop_range()
 
     @Operator.register_backend(ComputeBackend.NEON)

From 69a16213623f796200ca32e8de3d4336e4f14cce Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Thu, 27 Nov 2025 08:38:36 +0100
Subject: [PATCH 192/208] (feature) Two new optimization strategies for the
 mres stepper.

---
 examples/performance/mlups_3d_multires.py    |   2 +-
 xlb/helper/simulation_manager.py             | 238 ++++++
 xlb/mres_perf_optimization_type.py           |   2 +
 xlb/operator/stepper/nse_multires_stepper.py | 787 ++++++++++++++++---
 4 files changed, 930 insertions(+), 99 deletions(-)

diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 5b5851da..5486794c 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -275,7 +275,7 @@ def run(velocity_set,
     omega_finest = 1.0 / (3.0 * visc + 0.5)
 
     # Define a multi-resolution simulation manager
-    sim = xlb.helper.MultiresSimulationManager(omega=omega,
+    sim = xlb.helper.MultiresSimulationManager(omega_finest=omega_finest,
                                                grid=grid,
                                                boundary_conditions=boundary_conditions,
                                                collision_type=collision_model,
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 19a2f45d..39085b54 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -205,10 +205,248 @@ def recursion_fused_finest(level, app):
                 timestep=0,
             )
 
+        def recursion_fused_finest_254(level, app):
+            if level < 0:
+                return
+
+            # Compute omega at the current level
+            omega = self.omega_list[level]
+
+            if level == 0:
+                print(f"RECURSION down to the finest level {level}")
+                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=True,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=False,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                )
+                return
+
+            print(f"RECURSION down to level {level}")
+            print(f"RECURSION Level {level}, COLLIDE")
+
+            self.add_to_app(
+                app=app,
+                op_name="collide_coarse",
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=omega,
+                timestep=0,
+            )
+            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
+            # so is_self_f1_the_coalescence_dst_field is True
+            # 2. Explosion data is the output from the coarser collide, which is f_1=self.f_1.
+            # so is_self_f1_the_explosion_src_field is True
+
+            if level - 1 == 0:
+                recursion_fused_finest_254(level - 1, app)
+            else:
+                recursion_fused_finest_254(level - 1, app)
+                recursion_fused_finest_254(level - 1, app)
+            # Important: swapping of f_0 and f_1 is done here
+            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            self.add_to_app(
+                app=app,
+                op_name="stream_coarse_step_ABC",
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=self.coalescence_factor,
+                timestep=0,
+            )
+
+        def recursion_fused_finest_254_all(level, app):
+            if level < 0:
+                return
+
+            # Compute omega at the current level
+            omega = self.omega_list[level]
+
+            if level == 0:
+                print(f"RECURSION down to the finest level {level}")
+                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=True,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=False,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                )
+                return
+
+            print(f"RECURSION down to level {level}")
+            print(f"RECURSION Level {level}, COLLIDE")
+
+            self.add_to_app(
+                app=app,
+                op_name="collide_coarse_no_254",
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=omega,
+                timestep=0,
+            )
+            self.add_to_app(
+                app=app,
+                op_name="collide_coarse_254",
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=omega,
+                timestep=0,
+            )
+            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
+            # so is_self_f1_the_coalescence_dst_field is True
+            # 2. Explosion data is the output from the coarser collide, which is f_1=self.f_1.
+            # so is_self_f1_the_explosion_src_field is True
+
+            if level - 1 == 0:
+                recursion_fused_finest_254_all(level - 1, app)
+            else:
+                recursion_fused_finest_254_all(level - 1, app)
+                recursion_fused_finest_254_all(level - 1, app)
+            # Important: swapping of f_0 and f_1 is done here
+            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            self.add_to_app(
+                app=app,
+                op_name="stream_coarse_step_ABC_no_254",
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=self.coalescence_factor,
+                timestep=0,
+            )
+            self.add_to_app(
+                app=app,
+                op_name="stream_coarse_step_254",
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+            )
+            return
+
         if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             recursion_reference(self.count_levels - 1, app=self.app)
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
             recursion_fused_finest(self.count_levels - 1, app=self.app)
+        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254:
+            # Run kernel that generates the 254 value in the bc_mask
+            wp.synchronize()
+            self.bc_mask.update_host(0)
+            wp.synchronize()
+            self.bc_mask.export_vti(f"mask_before.vti", "u")
+
+            self.neon_container['reset_bc_mask_for_no_mr_no_bc_as_254'](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
+            wp.synchronize()
+            self.bc_mask.update_host(0)
+            wp.synchronize()
+            self.bc_mask.export_vti(f"mask_after.vti", "u")
+            recursion_fused_finest_254(self.count_levels - 1, app=self.app)
+        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254_ALL:
+            # Run kernel that generates the 254 value in the bc_mask
+            wp.synchronize()
+            self.bc_mask.update_host(0)
+            wp.synchronize()
+            self.bc_mask.export_vti(f"mask_before.vti", "u")
+
+            num_levels =  self.f_0.get_grid().num_levels
+            for l in range(num_levels):
+                self.neon_container['reset_bc_mask_for_no_mr_no_bc_as_254'](l, self.f_0, self.f_1, self.bc_mask,
+                                                                        self.bc_mask).run(0)
+            wp.synchronize()
+            self.bc_mask.update_host(0)
+            wp.synchronize()
+            self.bc_mask.export_vti(f"mask_after.vti", "u")
+            recursion_fused_finest_254_all(self.count_levels - 1, app=self.app)
         else:
             raise ValueError(f"Unknown optimization level: {self.opt_level}")
 
diff --git a/xlb/mres_perf_optimization_type.py b/xlb/mres_perf_optimization_type.py
index 622982ff..ae14bcf1 100644
--- a/xlb/mres_perf_optimization_type.py
+++ b/xlb/mres_perf_optimization_type.py
@@ -12,6 +12,8 @@ class MresPerfOptimizationType(Enum):
 
     NAIVE_COLLIDE_STREAM = auto()
     FUSION_AT_FINEST = auto()
+    FUSION_AT_FINEST_254 = auto()
+    FUSION_AT_FINEST_254_ALL = auto()
 
     @staticmethod
     def from_string(value: str) -> "MresPerfOptimizationType":
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index d18257f4..c4f9db2e 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -198,9 +198,9 @@ def compute(index: Any):
                                 if coalescence_factor > self.compute_dtype(0):
                                     coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor)
                                     wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
-
-                            else:
-                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+                            #
+                            # else:
+                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                 loader.declare_kernel(compute)
 
@@ -450,6 +450,136 @@ def device(index: Any):
 
             return ll_collide_coarse
 
+        @neon.Container.factory(name="collide_coarse_254")
+        def collide_coarse_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().num_levels  # NOTE(review): not referenced below; `timestep` is unused as well
+            # Collide-only container: voxels whose bc_mask id is 254 get their post-collision populations written to f_1.
+            def ll_collide_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                # f_0 is opened through a read handle and f_1 through a write handle.
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                # NOTE(review): the original remark here was "fake loading to enforce sequential step"; both handles loaded below are really used by the kernel -- confirm whether the remark is stale.
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+                # NOTE(review): _c and _w are not referenced by the kernel below.
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def device(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    """
+                    The c++ version starts with the following, which I am not sure is right:
+                        if (type(cell, 0) == CellType::bulk ) {
+                    BC type cells should do collide too
+                    """
+                    if _boundary_id != wp.uint8(254):
+                        return
+                    # Past this guard the voxel is tagged 254 in bc_mask; no boundary condition is applied in this kernel.
+                    # Read thread data for populations, these are post streaming
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_stream = _f0_thread
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(device)
+
+            return ll_collide_coarse
+
+        @neon.Container.factory(name="no_254_collide_coarse")
+        def collide_coarse_no_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().num_levels
+            # Collide container for voxels whose bc_mask id is neither 255 nor 254; voxels that have children get f_1 zeroed instead.
+            def ll_collide_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+                # When a level above this one exists, both fields are opened as write handles with stencil_up access.
+                if level + 1 < f_0_fd.get_grid().num_levels:
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                # NOTE(review): the original remark here was "fake loading to enforce sequential step"; both handles loaded below are really used by the kernel -- confirm whether the remark is stale.
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w  # NOTE(review): not referenced by the kernel below
+
+                @wp.func
+                def device(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    """
+                    The c++ version starts with the following, which I am not sure is right:
+                        if (type(cell, 0) == CellType::bulk ) {
+                    BC type cells should do collide too
+                    """
+                    if _boundary_id == wp.uint8(255):
+                        return
+                    if _boundary_id == wp.uint8(254):
+                        return
+                    if not wp.neon_has_child(f_0_pn, index):
+                        # Read thread data for populations, these are post streaming
+                        _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                        _f_post_stream = _f0_thread
+
+                        _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                        _feq = self.equilibrium.neon_functional(_rho, _u)
+                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                        # Apply post-collision boundary conditions
+                        _f_post_collision = apply_bc(
+                            index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                        )
+
+                        # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                        neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
+                        # Hand the post-collision populations to neon_mres_lbm_store_op on f_1 (an earlier comment said f_0, but f_1_pn is the handle passed)
+                        for l in range(self.velocity_set.q):
+                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                            if level < num_levels - 1:
+                                val = _f_post_collision[l]
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
+                                # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
+                            # The voxel's own slot in f_1 receives the post-collision value at every level.
+                            wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                    else:  # the voxel has children: zero all of its f_1 populations
+                        for l in range(self.velocity_set.q):
+                            wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
+                            # Verified that this is not needed: wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
+
+                loader.declare_kernel(device)
+
+            return ll_collide_coarse
+
         @neon.Container.factory(name="stream_coarse_step_ABC")
         def stream_coarse_step_ABC(
             level: int,
@@ -550,8 +680,8 @@ def cl_stream_coarse(index: Any):
                                 accumulated = accumulated * coalescence_factor
                                 # wp.neon_write(f_1_pn, index, l, accumulated)
                                 _f_post_stream[l] = accumulated
-                            else:
-                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+                            # else:
+                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                     # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
@@ -566,22 +696,25 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse_step_A")
-        def stream_coarse_step_A(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
+        @neon.Container.factory(name="no_254_stream_coarse_step_ABC")
+        def stream_coarse_step_ABC_no_254(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any,
+                omega: Any,
+                timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
+            num_levels = f_0_fd.get_grid().num_levels
 
             # if level != 0:
             #     # throw an exception
             #     raise Exception("Only the finest level is supported for now")
 
+            # module op to define odd of even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -593,9 +726,15 @@ def ll_stream_coarse(loader: neon.Loader):
 
                 _c = self.velocity_set.c
 
+                coalescence_factor_fd = omega
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
+
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+
+                    if _boundary_id == wp.uint8(254):
+                        return
                     if _boundary_id == wp.uint8(255):
                         return
 
@@ -610,48 +749,6 @@ def cl_stream_coarse(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
-                    for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    # wp.print("stream_coarse")
-
-                loader.declare_kernel(cl_stream_coarse)
-
-            return ll_stream_coarse
-
-        @neon.Container.factory(name="stream_coarse_step_B")
-        def stream_coarse_step_B(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
-            def ll_stream_coarse(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-                coalescence_factor_fd = omega
-                f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
-                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
-
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
-
-                @wp.func
-                def cl_stream_coarse(index: Any):
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
-                        return
-
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
-                        return
-
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
                             # HERE, we skip the center direction
@@ -660,7 +757,8 @@ def cl_stream_coarse(index: Any):
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
+                                                       has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
@@ -681,7 +779,8 @@ def cl_stream_coarse(index: Any):
                                         # YES ghost cell on top of us
                                         # YES courser ngh.
                                         # -> **Explosion**
-                                        wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        _f_post_stream[l] = exploded_pop
                         else:
                             # HERE -> I have a finer ngh. in direction pull (opposite l)
                             # Then I have to read from the halo on top of my finer ngh.
@@ -699,31 +798,34 @@ def cl_stream_coarse(index: Any):
                                 # -> **Coalescence**
                                 coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                                 accumulated = accumulated * coalescence_factor
-                                wp.neon_write(f_1_pn, index, l, accumulated)
+                                # wp.neon_write(f_1_pn, index, l, accumulated)
+                                _f_post_stream[l] = accumulated
+                            # else:
+                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
-                            else:
-                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+                    # do non mres post-streaming corrections
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn,
+                                              _f_post_collision, _f_post_stream, True)
+
+                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
 
                 loader.declare_kernel(cl_stream_coarse)
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="finest_fused_pull")
-        def finest_fused_pull(
+        @neon.Container.factory(name="reset_bc_mask_for_no_mr_no_bc_as_254")
+        def reset_bc_mask_for_no_mr_no_bc_as_254(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
             bc_mask_fd: Any,
             missing_mask_fd: Any,
-            omega: Any,
-            timestep: Any,
-            is_f1_the_explosion_src_field: bool,
         ):
-            if level != 0:
-                # throw an exception
-                raise Exception("Only the finest level is supported for now")
-            grid = f_0_fd.get_grid()
-            num_levels = grid.num_levels
+            num_levels = f_0_fd.get_grid().num_levels
 
             # if level != 0:
             #     # throw an exception
@@ -732,27 +834,25 @@ def finest_fused_pull(
             # module op to define odd of even iteration
             # od_or_even = wp.module("odd_or_even", "even")
 
-            def finest_fused_pull_launcher(loader: neon.Loader):
+            def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-                if level + 1 < f_0_fd.get_grid().num_levels:
-                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
-                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
-                else:
-                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
                 _c = self.velocity_set.c
-                _w = self.velocity_set.w
+
 
                 @wp.func
-                def finest_fused_pull_kernel(index: Any):
+                def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
+                    if _boundary_id != 0:
+                        return
 
                     are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
                     if are_we_a_halo_cell:
@@ -775,26 +875,455 @@ def finest_fused_pull_kernel(index: Any):
                         has_ngh_at_same_level = wp.bool(False)
                         accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
-                        # NO finer ngh. in the pull direction (opposite of l)
-                        if not has_ngh_at_same_level:
-                            # NO ngh. at the same level
-                            # COULD we have a ngh. at the courser level?
-                            if wp.neon_has_parent(f_0_pn, index):
-                                # YES halo cell on top of us
-                                has_a_coarser_ngh = wp.bool(False)
-                                if is_f1_the_explosion_src_field:
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                else:
+                        # if (!pin.hasChildren(cell, dir)) {
+                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                            # NO finer ngh. in the pull direction (opposite of l)
+                            if not has_ngh_at_same_level:
+                                # NO ngh. at the same level
+                                # COULD we have a ngh. at the courser level?
+                                if wp.neon_has_parent(f_0_pn, index):
+                                    # YES halo cell on top of us
+                                    has_a_coarser_ngh = wp.bool(False)
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
                                         f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
-                                if has_a_coarser_ngh:
-                                    # Full state:
-                                    # NO finer ngh. in the pull direction (opposite of l)
-                                    # NO ngh. at the same level
-                                    # YES ghost cell on top of us
+                                    if has_a_coarser_ngh:
+                                        # Full state:
+                                        # NO finer ngh. in the pull direction (opposite of l)
+                                        # NO ngh. at the same level
+                                        # YES ghost cell on top of us
+                                        # YES courser ngh.
+                                        # -> **Explosion**
+                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        return
+                        else:
+                            # HERE -> I have a finer ngh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer ngh.
+                            if has_ngh_at_same_level:
+                                # if l == 10:
+                                #     wp.print(accumulated)
+                                #     glob = wp.neon_global_idx(f_1_pn, index)
+                                #     wp.neon_cuda_info()
+                                #     wp.neon_print(glob)
+                                #     wp.neon_level(f_1_pn)
+                                # accumulated = _w[l]
+                                # Full State
+                                # YES finer ngh. in the pull direction (opposite of l)
+                                # YES ngh. at the same level
+                                # -> **Coalescence**
+                                return
+                            else:
+                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+
+                    # Only fluid voxels with the following properties can reach this line:
+                    # They are not BC voxels
+                    # They are not on a resolution jump -> they do not do coalescence or explosion
+                    # They are not mr halo cells
+                    wp.neon_write(bc_mask_pn, index, 0, wp.uint8(254))
+
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="stream_coarse_step_A")
+        def stream_coarse_step_A(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().num_levels
+
+            # Step A: plain streaming from f_0 into f_1 at `level`.
+            # Voxels with bc_mask id 255 and voxels that have children (halo cells) are skipped.
+            # NOTE(review): num_levels, omega and timestep are accepted but not used by this container.
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+
+                @wp.func
+                def cl_stream_coarse(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    # Store the streamed populations into f_1; no boundary condition is applied in this step.
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="stream_coarse_step_254")
+        def stream_coarse_step_254(
+                level: int,
+                f_0_fd: Any,
+                f_1_fd: Any,
+                bc_mask_fd: Any,
+                missing_mask_fd: Any
+        ):
+            # Streaming container restricted to voxels whose bc_mask id is 254: reads f_0, writes f_1.
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c  # NOTE(review): not referenced by the kernel below
+
+                @wp.func
+                def cl_stream_coarse(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(254):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # Plain streaming: use the result of self.stream.neon_functional on f_0 (no explosion/coalescence and no BC call here)
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    # Store the streamed populations into f_1.
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="stream_coarse_step_B")
+        def stream_coarse_step_B(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+        ):
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+                coalescence_factor_fd = omega  # in this container the `omega` argument is used as the coalescence-factor field
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)  # NOTE(review): not referenced by the kernel below
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w  # only mentioned in the commented-out debug lines below
+
+                @wp.func
+                def cl_stream_coarse(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+                    # Step B: for each non-center direction, f_1 is written only where explosion or coalescence applies.
+                    for l in range(self.velocity_set.q):
+                        if l == lattice_central_index:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                        # if (!pin.hasChildren(cell, dir)) {
+                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                            # NO finer ngh. in the pull direction (opposite of l)
+                            if not has_ngh_at_same_level:
+                                # NO ngh. at the same level
+                                # COULD we have a ngh. at the coarser level?
+                                if wp.neon_has_parent(f_0_pn, index):
+                                    # YES halo cell on top of us
+                                    has_a_coarser_ngh = wp.bool(False)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                    if has_a_coarser_ngh:
+                                        # Full state:
+                                        # NO finer ngh. in the pull direction (opposite of l)
+                                        # NO ngh. at the same level
+                                        # YES ghost cell on top of us
+                                        # YES coarser ngh.
+                                        # -> **Explosion**
+                                        wp.neon_write(f_1_pn, index, l, exploded_pop)
+                        else:
+                            # HERE -> I have a finer ngh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer ngh.
+                            if has_ngh_at_same_level:
+                                # if l == 10:
+                                #     wp.print(accumulated)
+                                #     glob = wp.neon_global_idx(f_1_pn, index)
+                                #     wp.neon_cuda_info()
+                                #     wp.neon_print(glob)
+                                #     wp.neon_level(f_1_pn)
+                                # accumulated = _w[l]
+                                # Full State
+                                # YES finer ngh. in the pull direction (opposite of l)
+                                # YES ngh. at the same level
+                                # -> **Coalescence**
+                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
+                                accumulated = accumulated * coalescence_factor
+                                wp.neon_write(f_1_pn, index, l, accumulated)
+                            #
+                            # NOTE(review): "finer ngh. but no same-level ngh." is left unhandled here;
+                            # the `wp.print("ERRRRRRORRRRRRRRRRRRRR")` else-branch is disabled.
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="finest_fused_pull")
+        def finest_fused_pull(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: Any,
+            is_f1_the_explosion_src_field: bool,
+        ):
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+            grid = f_0_fd.get_grid()
+            num_levels = grid.num_levels
+
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
+            def finest_fused_pull_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                if level + 1 < f_0_fd.get_grid().num_levels:
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def finest_fused_pull_kernel(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    for l in range(self.velocity_set.q):
+                        if l == lattice_central_index:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                        # NO finer ngh. in the pull direction (opposite of l)
+                        if not has_ngh_at_same_level:
+                            # NO ngh. at the same level
+                            # COULD we have a ngh. at the coarser level?
+                            if wp.neon_has_parent(f_0_pn, index):
+                                # YES halo cell on top of us
+                                has_a_coarser_ngh = wp.bool(False)
+                                if is_f1_the_explosion_src_field:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                else:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                if has_a_coarser_ngh:
+                                    # Full state:
+                                    # NO finer ngh. in the pull direction (opposite of l)
+                                    # NO ngh. at the same level
+                                    # YES ghost cell on top of us
+                                    # YES coarser ngh.
+                                    # -> **Explosion**
+                                    # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                    _f_post_stream[l] = exploded_pop
+
+                    # do non mres post-streaming corrections
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    # Apply post-collision boundary conditions
+                    _f_post_collision = apply_bc(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                    )
+
+                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
+                    # Accumulate the post-collision populations in f_0
+                    for l in range(self.velocity_set.q):
+                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                        if level < num_levels - 1:
+                            val = _f_post_collision[l]
+                            if is_f1_the_explosion_src_field:
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
+                            else:
+                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
+
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(finest_fused_pull_kernel)
+
+            return finest_fused_pull_launcher
+
+        @neon.Container.factory(name="finest_fused_pull_no_254")
+        def finest_fused_pull_no_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep:Any,
+            is_f1_the_explosion_src_field: bool,
+        ):
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+            grid = f_0_fd.get_grid()
+            num_levels = grid.num_levels
+
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd or even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
+            def finest_fused_pull_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                if level + 1 < f_0_fd.get_grid().num_levels:
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def finest_fused_pull_kernel(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+                    if _boundary_id == wp.uint8(254):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    for l in range(self.velocity_set.q):
+                        if l == lattice_central_index:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                        # NO finer ngh. in the pull direction (opposite of l)
+                        if not has_ngh_at_same_level:
+                            # NO ngh. at the same level
+                            # COULD we have a ngh. at the coarser level?
+                            if wp.neon_has_parent(f_0_pn, index):
+                                # YES halo cell on top of us
+                                has_a_coarser_ngh = wp.bool(False)
+                                if is_f1_the_explosion_src_field:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                else:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                if has_a_coarser_ngh:
+                                    # Full state:
+                                    # NO finer ngh. in the pull direction (opposite of l)
+                                    # NO ngh. at the same level
+                                    # YES ghost cell on top of us
                                     # YES courser ngh.
                                     # -> **Explosion**
                                     # wp.neon_write(f_1_pn, index, l, exploded_pop)
@@ -831,6 +1360,61 @@ def finest_fused_pull_kernel(index: Any):
 
             return finest_fused_pull_launcher
 
+        @neon.Container.factory(name="254_finest_fused_pull")
+        def finest_fused_pull_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+        ):
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+            grid = f_0_fd.get_grid()
+            num_levels = grid.num_levels
+
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+
+            def finest_fused_pull_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def finest_fused_pull_kernel_254(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(254):
+                        return
+
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(finest_fused_pull_kernel_254)
+
+            return finest_fused_pull_launcher
+
         @neon.Container.factory(name="stream_coarse_step_C")
         def stream_coarse_step_C(
             level: int,
@@ -889,6 +1473,13 @@ def cl_stream_coarse(index: Any):
             "stream_coarse_step_B": stream_coarse_step_B,
             "stream_coarse_step_C": stream_coarse_step_C,
             "finest_fused_pull": finest_fused_pull,
+            "finest_fused_pull_no_254": finest_fused_pull_no_254,
+            "finest_fused_pull_254": finest_fused_pull_254,
+            "reset_bc_mask_for_no_mr_no_bc_as_254":reset_bc_mask_for_no_mr_no_bc_as_254,
+            "collide_coarse_no_254":collide_coarse_no_254,
+            "collide_coarse_254":collide_coarse_254,
+            "stream_coarse_step_ABC_no_254":stream_coarse_step_ABC_no_254,
+            "stream_coarse_step_254":stream_coarse_step_254,
         }
 
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):

From 77c4f3e4c9a831b5dd6eab128c3acdc3f8ebab9d Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Tue, 16 Dec 2025 14:12:27 +0100
Subject: [PATCH 193/208] (refactoring) Removing debug IO

---
 xlb/helper/simulation_manager.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 39085b54..64d403e7 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -421,31 +421,31 @@ def recursion_fused_finest_254_all(level, app):
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254:
             # Run kernel that generates teh 254 value in the bc_mask
             wp.synchronize()
-            self.bc_mask.update_host(0)
-            wp.synchronize()
-            self.bc_mask.export_vti(f"mask_before.vti", "u")
+            # self.bc_mask.update_host(0)
+            # wp.synchronize()
+            # self.bc_mask.export_vti(f"mask_before.vti", "u")
 
             self.neon_container['reset_bc_mask_for_no_mr_no_bc_as_254'](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             wp.synchronize()
-            self.bc_mask.update_host(0)
-            wp.synchronize()
-            self.bc_mask.export_vti(f"mask_after.vti", "u")
+            # self.bc_mask.update_host(0)
+            # wp.synchronize()
+            # self.bc_mask.export_vti(f"mask_after.vti", "u")
             recursion_fused_finest_254(self.count_levels - 1, app=self.app)
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254_ALL:
             # Run kernel that generates teh 254 value in the bc_mask
             wp.synchronize()
-            self.bc_mask.update_host(0)
-            wp.synchronize()
-            self.bc_mask.export_vti(f"mask_before.vti", "u")
+            # self.bc_mask.update_host(0)
+            # wp.synchronize()
+            # self.bc_mask.export_vti(f"mask_before.vti", "u")
 
             num_levels =  self.f_0.get_grid().num_levels
             for l in range(num_levels):
                 self.neon_container['reset_bc_mask_for_no_mr_no_bc_as_254'](l, self.f_0, self.f_1, self.bc_mask,
                                                                         self.bc_mask).run(0)
+            # wp.synchronize()
+            # self.bc_mask.update_host(0)
             wp.synchronize()
-            self.bc_mask.update_host(0)
-            wp.synchronize()
-            self.bc_mask.export_vti(f"mask_after.vti", "u")
+            # self.bc_mask.export_vti(f"mask_after.vti", "u")
             recursion_fused_finest_254_all(self.count_levels - 1, app=self.app)
         else:
             raise ValueError(f"Unknown optimization level: {self.opt_level}")

From 2738780bc5980ec892661c5a9b0213d2ca9164c1 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Fri, 19 Dec 2025 17:08:50 +0100
Subject: [PATCH 194/208] (refactoring) Applying ruff.

---
 examples/cfd/grid_refinement/ahmed.py        |  2 +-
 examples/performance/mlups_3d.py             | 98 +++++++++++---------
 examples/performance/mlups_3d_multires.py    | 82 ++++++++--------
 xlb/helper/simulation_manager.py             | 11 +--
 xlb/operator/stepper/nse_multires_stepper.py | 76 ++++++---------
 5 files changed, 131 insertions(+), 138 deletions(-)

diff --git a/examples/cfd/grid_refinement/ahmed.py b/examples/cfd/grid_refinement/ahmed.py
index f6ef8b20..3b258870 100644
--- a/examples/cfd/grid_refinement/ahmed.py
+++ b/examples/cfd/grid_refinement/ahmed.py
@@ -56,7 +56,7 @@ def generate_cuboid_mesh(stl_filename, voxel_size):
     """
     # Domain multipliers for each refinement level
     domain_multiplier = [
-        [3.0, 4.0, 2.5, 2.5, 0.0, 4.0],     # -x, x, -y, y, -z, z
+        [3.0, 4.0, 2.5, 2.5, 0.0, 4.0],  # -x, x, -y, y, -z, z
         [1.2, 1.25, 1.75, 1.75, 0.0, 1.5],
         [0.8, 1.0, 1.25, 1.25, 0.0, 1.2],
         [0.5, 0.65, 0.6, 0.60, 0.0, 0.6],
diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 2b7a0ea1..66945e89 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -22,7 +22,7 @@ def parse_arguments():
     VELOCITY_SETS = ["D3Q19", "D3Q27"]
     COLLISION_MODELS = ["BGK", "KBC"]
     OCC_OPTIONS = ["standard", "none"]
-    
+
     parser = argparse.ArgumentParser(
         description="MLUPS Benchmark for 3D Lattice Boltzmann Method Simulation",
         epilog=f"""
@@ -32,41 +32,44 @@ def parse_arguments():
   %(prog)s 150 2000 neon fp32/fp32 --gpu_devices=[0,1,2] --measure_scalability --report
   %(prog)s 100 1000 neon fp32/fp32 --repetitions 5 --export_final_velocity
         """,
-        formatter_class=argparse.RawDescriptionHelpFormatter
+        formatter_class=argparse.RawDescriptionHelpFormatter,
     )
-    
+
     # Positional arguments
-    parser.add_argument("cube_edge", type=int, 
-                       help="Length of the edge of the cubic grid (e.g., 100)")
-    parser.add_argument("num_steps", type=int, 
-                       help="Number of timesteps for the simulation (e.g., 1000)")
-    parser.add_argument("compute_backend", type=str, 
-                       choices=COMPUTE_BACKENDS,
-                       help=f"Backend for the simulation ({', '.join(COMPUTE_BACKENDS)})")
-    parser.add_argument("precision", type=str, 
-                       choices=PRECISION_OPTIONS,
-                       help=f"Precision for the simulation ({', '.join(PRECISION_OPTIONS)})")
-    
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid (e.g., 100)")
+    parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation (e.g., 1000)")
+    parser.add_argument("compute_backend", type=str, choices=COMPUTE_BACKENDS, help=f"Backend for the simulation ({', '.join(COMPUTE_BACKENDS)})")
+    parser.add_argument("precision", type=str, choices=PRECISION_OPTIONS, help=f"Precision for the simulation ({', '.join(PRECISION_OPTIONS)})")
+
     # Optional arguments
-    parser.add_argument("--gpu_devices", type=str, default=None,
-                       help="CUDA devices to use for Neon backend (e.g., [0,1,2] or [0])")
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", 
-                       choices=VELOCITY_SETS,
-                       help=f"Lattice velocity set (default: D3Q19, choices: {', '.join(VELOCITY_SETS)})")
-    parser.add_argument("--collision_model", type=str, default="BGK", 
-                       choices=COLLISION_MODELS,
-                       help=f"Collision model (default: BGK, choices: {', '.join(COLLISION_MODELS)}, KBC requires D3Q27)")
-    parser.add_argument("--occ", type=str, default="standard", 
-                       choices=OCC_OPTIONS,
-                       help=f"Overlapping Communication and Computation strategy (default: standard, choices: {', '.join(OCC_OPTIONS)})")
-    parser.add_argument("--report", action="store_true", 
-                       help="Generate Neon performance report")
-    parser.add_argument("--export_final_velocity", action="store_true", 
-                       help="Export final velocity field to VTI file")
-    parser.add_argument("--measure_scalability", action="store_true", 
-                       help="Measure performance across different GPU counts")
-    parser.add_argument("--repetitions", type=int, default=1, metavar="N",
-                       help="Number of simulation repetitions for statistical analysis (default: 1)")
+    parser.add_argument("--gpu_devices", type=str, default=None, help="CUDA devices to use for Neon backend (e.g., [0,1,2] or [0])")
+    parser.add_argument(
+        "--velocity_set",
+        type=str,
+        default="D3Q19",
+        choices=VELOCITY_SETS,
+        help=f"Lattice velocity set (default: D3Q19, choices: {', '.join(VELOCITY_SETS)})",
+    )
+    parser.add_argument(
+        "--collision_model",
+        type=str,
+        default="BGK",
+        choices=COLLISION_MODELS,
+        help=f"Collision model (default: BGK, choices: {', '.join(COLLISION_MODELS)}, KBC requires D3Q27)",
+    )
+    parser.add_argument(
+        "--occ",
+        type=str,
+        default="standard",
+        choices=OCC_OPTIONS,
+        help=f"Overlapping Communication and Computation strategy (default: standard, choices: {', '.join(OCC_OPTIONS)})",
+    )
+    parser.add_argument("--report", action="store_true", help="Generate Neon performance report")
+    parser.add_argument("--export_final_velocity", action="store_true", help="Export final velocity field to VTI file")
+    parser.add_argument("--measure_scalability", action="store_true", help="Measure performance across different GPU counts")
+    parser.add_argument(
+        "--repetitions", type=int, default=1, metavar="N", help="Number of simulation repetitions for statistical analysis (default: 1)"
+    )
 
     args = parser.parse_args()
 
@@ -97,8 +100,9 @@ def parse_arguments():
         if args.gpu_devices is None:
             print("[INFO] No GPU devices specified. Using default device 0.")
             args.gpu_devices = [0]
-        
+
         import neon
+
         occ_enum = neon.SkeletonConfig.OCC.from_string(args.occ)
         args.occ_enum = occ_enum  # Store the enum for Neon
         args.occ_display = args.occ  # Store the original string for display
@@ -129,7 +133,6 @@ def parse_arguments():
         velocity_set = xlb.velocity_set.D3Q27(precision_policy=args.precision_policy, compute_backend=compute_backend)
     args.velocity_set = velocity_set
 
-
     print_args(args)
 
     return args
@@ -140,27 +143,27 @@ def print_args(args):
     print("\n" + "=" * 70)
     print("                    SIMULATION CONFIGURATION")
     print("=" * 70)
-    
+
     # Grid and simulation parameters
     print("GRID & SIMULATION:")
     print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
     print(f"  Total Lattice Points:   {args.cube_edge**3:,}")
     print(f"  Time Steps:             {args.num_steps:,}")
     print(f"  Repetitions:            {args.repetitions}")
-    
+
     # Computational settings
     print("\nCOMPUTATIONAL SETTINGS:")
     print(f"  Compute Backend:        {args.compute_backend.name}")
     print(f"  Precision Policy:       {args.precision}")
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
     print(f"  Collision Model:        {args.collision_model}")
-    
+
     # Backend-specific settings
     if args.compute_backend.name == "NEON":
         print("\nNEON BACKEND SETTINGS:")
         print(f"  GPU Devices:            {args.gpu_devices}")
         print(f"  OCC Strategy:           {args.occ_display}")
-    
+
     # Output options
     print("\nOUTPUT OPTIONS:")
     print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
@@ -184,7 +187,9 @@ def init_xlb(args):
     return args.compute_backend, args.precision_policy, options
 
 
-def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices, collision_model):
+def run_simulation(
+    compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices, collision_model
+):
     grid = grid_factory(grid_shape, backend_config=options)
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -262,6 +267,7 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return mlups
 
+
 def print_summary_with_stats(args, stats):
     """Print comprehensive simulation summary with statistics from multiple repetitions"""
     total_lattice_points = args.cube_edge**3
@@ -337,6 +343,7 @@ def print_summary_with_stats(args, stats):
 
     print("=" * 70)
 
+
 def print_scalability_summary(args, stats_list):
     """Print comprehensive scalability summary with MLUPs statistics for different GPU counts"""
     total_lattice_points = args.cube_edge**3
@@ -423,16 +430,17 @@ def print_scalability_summary(args, stats_list):
 
     print("=" * 95)
 
+
 def report(args, stats):
     import neon
     import sys
 
     report = neon.Report("LBM MLUPS LDC")
-    
+
     # Save the full command line
     command_line = " ".join(sys.argv)
     report.add_member("command_line", command_line)
-    
+
     report.add_member("velocity_set", args.velocity_set.__class__.__name__)
     report.add_member("compute_backend", args.compute_backend.name)
     report.add_member("precision_policy", args.precision)
@@ -469,14 +477,14 @@ def report(args, stats):
     report_name += f"_collision_model_{args.collision_model}"
     report_name += f"_grid_size_{args.cube_edge}"
     report_name += f"_num_steps_{args.num_steps}"
-    
+
     if args.compute_backend.name == "NEON":
         report_name += f"_occ_{args.occ_display}"
         report_name += f"_num_devices_{len(args.gpu_devices)}"
-    
+
     if args.repetitions > 1:
         report_name += f"_repetitions_{args.repetitions}"
-    
+
     report.write(report_name, True)
 
 
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 5486794c..3a143764 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -29,34 +29,36 @@ def parse_arguments():
   velocity_set: D3Q19, D3Q27
   collision_model: BGK, KBC
         """,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
     # Positional arguments
     parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid (e.g., 100)")
     parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation (e.g., 1000)")
     parser.add_argument("compute_backend", type=str, help="Backend for the simulation (neon)")
     parser.add_argument("precision", type=str, help="Precision for the simulation (fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16)")
     parser.add_argument("num_levels", type=int, help="Number of levels for the multiresolution grid (e.g., 2)")
-    parser.add_argument("mres_perf_opt", type=MresPerfOptimizationType.from_string, help="Multi-resolution performance optimization strategy (NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST)")
+    parser.add_argument(
+        "mres_perf_opt",
+        type=MresPerfOptimizationType.from_string,
+        help="Multi-resolution performance optimization strategy (NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST)",
+    )
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
-    parser.add_argument("--velocity_set", type=str, default="D3Q19",
-                        help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-    parser.add_argument("--collision_model", type=str, default="BGK",
-                        help="Collision model: BGK or KBC (default: BGK)")
+    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
+    parser.add_argument("--collision_model", type=str, default="BGK", help="Collision model: BGK or KBC (default: BGK)")
 
     parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
-    parser.add_argument("--export_final_velocity", action="store_true",
-                        help="Export the final velocity field to a vti file (default: disabled)")
+    parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)")
 
     try:
         args = parser.parse_args()
     except SystemExit:
         # Re-raise with custom message
-        print("\n" + "="*60)
+        print("\n" + "=" * 60)
         print("USAGE EXAMPLES:")
-        print("="*60)
+        print("=" * 60)
         print("python mlups_3d_multires.py 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM")
         print("python mlups_3d_multires.py 200 500 neon fp64/fp64 3 FUSION_AT_FINEST --report")
         print("\nVALID VALUES:")
@@ -65,7 +67,7 @@ def parse_arguments():
         print("  mres_perf_opt: NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST")
         print("  velocity_set: D3Q19, D3Q27")
         print("  collision_model: BGK, KBC")
-        print("="*60)
+        print("=" * 60)
         raise
 
     print_args(args)
@@ -85,7 +87,7 @@ def print_args(args):
     print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
     print("=" * 60)
     print(f"Grid Size:            {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
-    print(f"Total Lattice Points: {args.cube_edge ** 3:,}")
+    print(f"Total Lattice Points: {args.cube_edge**3:,}")
     print(f"Time Steps:           {args.num_steps:,}")
     print(f"Number Levels:        {args.num_levels}")
     print(f"Compute Backend:      {args.compute_backend}")
@@ -151,7 +153,7 @@ def peel(dim, idx, peel_level, outwards):
     dim = neon.Index_3d(grid_shape[0], grid_shape[1], grid_shape[2])
 
     def get_peeled_np(level, width):
-        divider = 2 ** level
+        divider = 2**level
         m = neon.Index_3d(dim.x // divider, dim.y // divider, dim.z // divider)
         if level == 0:
             m = dim
@@ -175,7 +177,7 @@ def get_levels(num_levels):
             l = get_peeled_np(i, 8)
             levels.append(l)
         lastLevel = num_levels - 1
-        divider = 2 ** lastLevel
+        divider = 2**lastLevel
         m = neon.Index_3d(dim.x // divider + 1, dim.y // divider + 1, dim.z // divider + 1)
         lastLevel = np.ones((m.x, m.y, m.z), dtype=int)
         lastLevel = np.ascontiguousarray(lastLevel, dtype=np.int32)
@@ -194,8 +196,7 @@ def get_levels(num_levels):
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
     lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in
-             range(len(grid.shape))]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
     # convert bc indices to a list of list, where the first entry of the list corresponds to the finest level
     lid = [lid] + [[] for _ in range(num_levels - 1)]
@@ -208,8 +209,8 @@ def problem2(grid_shape, velocity_set, num_levels):
     level_origins = []
     level_list = []
     for lvl in range(num_levels):
-        divider = 2 ** lvl
-        growth = 1.5 ** lvl
+        divider = 2**lvl
+        growth = 1.5**lvl
         shape = grid_shape[0] // divider, grid_shape[1] // divider, grid_shape[2] // divider
         if lvl == num_levels - 1:
             level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
@@ -232,8 +233,7 @@ def problem2(grid_shape, velocity_set, num_levels):
     box = grid.bounding_box_indices(shape=grid.level_to_shape(num_levels - 1))
     box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(1), remove_edges=True)
     lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in
-             range(len(grid.shape))]
+    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
     walls = np.unique(np.array(walls), axis=-1).tolist()
     # convert bc indices to a list of list, where the first entry of the list corresponds to the finest level
     lid = [[] for _ in range(num_levels - 1)] + [lid]
@@ -241,13 +241,15 @@ def problem2(grid_shape, velocity_set, num_levels):
     return grid, lid, walls
 
 
-def run(velocity_set,
-        grid_shape,
-        num_steps,
-        num_levels,
-        collision_model,
-        export_final_velocity,
-        mres_perf_opt, ):
+def run(
+    velocity_set,
+    grid_shape,
+    num_steps,
+    num_levels,
+    collision_model,
+    export_final_velocity,
+    mres_perf_opt,
+):
     # Create grid and setup boundary conditions
 
     # Convert indices to list of indices per level
@@ -275,11 +277,13 @@ def run(velocity_set,
     omega_finest = 1.0 / (3.0 * visc + 0.5)
 
     # Define a multi-resolution simulation manager
-    sim = xlb.helper.MultiresSimulationManager(omega_finest=omega_finest,
-                                               grid=grid,
-                                               boundary_conditions=boundary_conditions,
-                                               collision_type=collision_model,
-                                               mres_perf_opt=mres_perf_opt, )
+    sim = xlb.helper.MultiresSimulationManager(
+        omega_finest=omega_finest,
+        grid=grid,
+        boundary_conditions=boundary_conditions,
+        collision_type=collision_model,
+        mres_perf_opt=mres_perf_opt,
+    )
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
@@ -289,8 +293,8 @@ def run(velocity_set,
     start_time = time.time()
 
     if num_levels == 1:
-        num_steps = num_steps // 2 
-    
+        num_steps = num_steps // 2
+
     for i in range(num_steps):
         sim.step()
         # if i % 1000 == 0:
@@ -310,7 +314,7 @@ def run(velocity_set,
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
     num_step_finer = num_steps * 2 ** (num_levels - 1)
-    total_lattice_updates = cube_edge ** 3 * num_step_finer
+    total_lattice_updates = cube_edge**3 * num_step_finer
     mlups = (total_lattice_updates / elapsed_time) / 1e6
     return {"EMLUPS": mlups, "finer_steps": num_step_finer}
 
@@ -360,9 +364,9 @@ def main():
     args = parse_arguments()
     velocity_set = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    stats = run(velocity_set, grid_shape, args.num_steps, args.num_levels, args.collision_model,
-                args.export_final_velocity,
-                mres_perf_opt = args.mres_perf_opt)
+    stats = run(
+        velocity_set, grid_shape, args.num_steps, args.num_levels, args.collision_model, args.export_final_velocity, mres_perf_opt=args.mres_perf_opt
+    )
     mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats["time"], stats["num_levels"])
 
     print(f"Simulation completed in {stats['time']:.2f} seconds")
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 64d403e7..c1c43a10 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -235,7 +235,7 @@ def recursion_fused_finest_254(level, app):
                     f_1_fd=self.f_1,
                     bc_mask_fd=self.bc_mask,
                     missing_mask_fd=self.missing_mask,
-                    omega=omega
+                    omega=omega,
                 )
                 self.add_to_app(
                     app=app,
@@ -329,7 +329,7 @@ def recursion_fused_finest_254_all(level, app):
                     f_1_fd=self.f_1,
                     bc_mask_fd=self.bc_mask,
                     missing_mask_fd=self.missing_mask,
-                    omega=omega
+                    omega=omega,
                 )
                 self.add_to_app(
                     app=app,
@@ -425,7 +425,7 @@ def recursion_fused_finest_254_all(level, app):
             # wp.synchronize()
             # self.bc_mask.export_vti(f"mask_before.vti", "u")
 
-            self.neon_container['reset_bc_mask_for_no_mr_no_bc_as_254'](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
+            self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             wp.synchronize()
             # self.bc_mask.update_host(0)
             # wp.synchronize()
@@ -438,10 +438,9 @@ def recursion_fused_finest_254_all(level, app):
             # wp.synchronize()
             # self.bc_mask.export_vti(f"mask_before.vti", "u")
 
-            num_levels =  self.f_0.get_grid().num_levels
+            num_levels = self.f_0.get_grid().num_levels
             for l in range(num_levels):
-                self.neon_container['reset_bc_mask_for_no_mr_no_bc_as_254'](l, self.f_0, self.f_1, self.bc_mask,
-                                                                        self.bc_mask).run(0)
+                self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](l, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             # wp.synchronize()
             # self.bc_mask.update_host(0)
             wp.synchronize()
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index c4f9db2e..240da307 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -465,7 +465,6 @@ def collide_coarse_254(
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
@@ -698,13 +697,13 @@ def cl_stream_coarse(index: Any):
 
         @neon.Container.factory(name="no_254_stream_coarse_step_ABC")
         def stream_coarse_step_ABC_no_254(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any,
-                omega: Any,
-                timestep: int,
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
         ):
             num_levels = f_0_fd.get_grid().num_levels
 
@@ -757,8 +756,7 @@ def cl_stream_coarse(index: Any):
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0),
-                                                       has_ngh_at_same_level)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
@@ -804,8 +802,7 @@ def cl_stream_coarse(index: Any):
                             #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                     # do non mres post-streaming corrections
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn,
-                                              _f_post_collision, _f_post_stream, True)
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
                     # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
@@ -845,7 +842,6 @@ def ll_stream_coarse(loader: neon.Loader):
 
                 _c = self.velocity_set.c
 
-
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
@@ -921,7 +917,6 @@ def cl_stream_coarse(index: Any):
                     # They are not mr halo cells
                     wp.neon_write(bc_mask_pn, index, 0, wp.uint8(254))
 
-
                 loader.declare_kernel(cl_stream_coarse)
 
             return ll_stream_coarse
@@ -979,13 +974,7 @@ def cl_stream_coarse(index: Any):
             return ll_stream_coarse
 
         @neon.Container.factory(name="stream_coarse_step_254")
-        def stream_coarse_step_254(
-                level: int,
-                f_0_fd: Any,
-                f_1_fd: Any,
-                bc_mask_fd: Any,
-                missing_mask_fd: Any
-        ):
+        def stream_coarse_step_254(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any):
 
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -1243,7 +1232,7 @@ def finest_fused_pull_no_254(
             bc_mask_fd: Any,
             missing_mask_fd: Any,
             omega: Any,
-            timestep:Any,
+            timestep: Any,
             is_f1_the_explosion_src_field: bool,
         ):
             if level != 0:
@@ -1397,7 +1386,6 @@ def finest_fused_pull_kernel_254(index: Any):
                     if _boundary_id != wp.uint8(254):
                         return
 
-
                     # do stream normally
                     _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
@@ -1475,20 +1463,17 @@ def cl_stream_coarse(index: Any):
             "finest_fused_pull": finest_fused_pull,
             "finest_fused_pull_no_254": finest_fused_pull_no_254,
             "finest_fused_pull_254": finest_fused_pull_254,
-            "reset_bc_mask_for_no_mr_no_bc_as_254":reset_bc_mask_for_no_mr_no_bc_as_254,
-            "collide_coarse_no_254":collide_coarse_no_254,
-            "collide_coarse_254":collide_coarse_254,
-            "stream_coarse_step_ABC_no_254":stream_coarse_step_ABC_no_254,
-            "stream_coarse_step_254":stream_coarse_step_254,
+            "reset_bc_mask_for_no_mr_no_bc_as_254": reset_bc_mask_for_no_mr_no_bc_as_254,
+            "collide_coarse_no_254": collide_coarse_no_254,
+            "collide_coarse_254": collide_coarse_254,
+            "stream_coarse_step_ABC_no_254": stream_coarse_step_ABC_no_254,
+            "stream_coarse_step_254": stream_coarse_step_254,
         }
 
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
-    def add_to_app(
-        self,
-            **kwargs
-    ):
+    def add_to_app(self, **kwargs):
         import inspect
 
         def validate_kwargs_forward(func, kwargs):
@@ -1504,23 +1489,18 @@ def validate_kwargs_forward(func, kwargs):
             errors = {}
 
             # --- 1. Positional-only required params (cannot be given via kwargs) ---
-            pos_only_required = [
-                name for name, p in params.items()
-                if p.kind == inspect.Parameter.POSITIONAL_ONLY
-                   and p.default is inspect._empty
-            ]
+            pos_only_required = [name for name, p in params.items() if p.kind == inspect.Parameter.POSITIONAL_ONLY and p.default is inspect._empty]
             if pos_only_required:
                 errors["positional_only_required"] = pos_only_required
 
             # --- 2. Unexpected kwargs (if no **kwargs in target) ---
-            has_var_kw = any(
-                p.kind == inspect.Parameter.VAR_KEYWORD
-                for p in params.values()
-            )
+            has_var_kw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())
             if not has_var_kw:
                 allowed_kw = {
-                    name for name, p in params.items()
-                    if p.kind in (
+                    name
+                    for name, p in params.items()
+                    if p.kind
+                    in (
                         inspect.Parameter.POSITIONAL_OR_KEYWORD,
                         inspect.Parameter.KEYWORD_ONLY,
                     )
@@ -1531,13 +1511,15 @@ def validate_kwargs_forward(func, kwargs):
 
             # --- 3. Missing required keyword-passable params ---
             missing_required = [
-                name for name, p in params.items()
-                if p.kind in (
+                name
+                for name, p in params.items()
+                if p.kind
+                in (
                     inspect.Parameter.POSITIONAL_OR_KEYWORD,
                     inspect.Parameter.KEYWORD_ONLY,
                 )
-                   and p.default is inspect._empty  # no default
-                   and name not in kwargs  # not provided
+                and p.default is inspect._empty  # no default
+                and name not in kwargs  # not provided
             ]
             if missing_required:
                 errors["missing_required"] = missing_required

From 23390e26dc0c9137c2836eda876c8711b0bb3705 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Mon, 9 Mar 2026 13:54:54 +0100
Subject: [PATCH 195/208] Function renaming.

---
 xlb/helper/simulation_manager.py             |  28 ++--
 xlb/operator/stepper/nse_multires_stepper.py | 160 +++++--------------
 2 files changed, 50 insertions(+), 138 deletions(-)

diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index c1c43a10..36a9974d 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -217,7 +217,7 @@ def recursion_fused_finest_254(level, app):
                 print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_no_254",
+                    op_name="CFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_0,
                     f_1_fd=self.f_1,
@@ -229,7 +229,7 @@ def recursion_fused_finest_254(level, app):
                 )
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_254",
+                    op_name="SFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_0,
                     f_1_fd=self.f_1,
@@ -239,7 +239,7 @@ def recursion_fused_finest_254(level, app):
                 )
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_no_254",
+                    op_name="CFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_1,
                     f_1_fd=self.f_0,
@@ -251,7 +251,7 @@ def recursion_fused_finest_254(level, app):
                 )
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_254",
+                    op_name="SFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_1,
                     f_1_fd=self.f_0,
@@ -311,7 +311,7 @@ def recursion_fused_finest_254_all(level, app):
                 print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_no_254",
+                    op_name="CFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_0,
                     f_1_fd=self.f_1,
@@ -323,7 +323,7 @@ def recursion_fused_finest_254_all(level, app):
                 )
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_254",
+                    op_name="SFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_0,
                     f_1_fd=self.f_1,
@@ -333,7 +333,7 @@ def recursion_fused_finest_254_all(level, app):
                 )
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_no_254",
+                    op_name="CFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_1,
                     f_1_fd=self.f_0,
@@ -345,7 +345,7 @@ def recursion_fused_finest_254_all(level, app):
                 )
                 self.add_to_app(
                     app=app,
-                    op_name="finest_fused_pull_254",
+                    op_name="SFV_finest_fused_pull",
                     level=level,
                     f_0_fd=self.f_1,
                     f_1_fd=self.f_0,
@@ -360,7 +360,7 @@ def recursion_fused_finest_254_all(level, app):
 
             self.add_to_app(
                 app=app,
-                op_name="collide_coarse_no_254",
+                op_name="CFV_collide_coarse",
                 level=level,
                 f_0_fd=self.f_0,
                 f_1_fd=self.f_1,
@@ -371,7 +371,7 @@ def recursion_fused_finest_254_all(level, app):
             )
             self.add_to_app(
                 app=app,
-                op_name="collide_coarse_254",
+                op_name="SFV_collide_coarse",
                 level=level,
                 f_0_fd=self.f_0,
                 f_1_fd=self.f_1,
@@ -394,7 +394,7 @@ def recursion_fused_finest_254_all(level, app):
             print(f"RECURSION Level {level}, stream_coarse_step_ABC")
             self.add_to_app(
                 app=app,
-                op_name="stream_coarse_step_ABC_no_254",
+                op_name="SFV_stream_coarse_step_ABC",
                 level=level,
                 f_0_fd=self.f_1,
                 f_1_fd=self.f_0,
@@ -405,7 +405,7 @@ def recursion_fused_finest_254_all(level, app):
             )
             self.add_to_app(
                 app=app,
-                op_name="stream_coarse_step_254",
+                op_name="SFV_stream_coarse_step",
                 level=level,
                 f_0_fd=self.f_1,
                 f_1_fd=self.f_0,
@@ -425,7 +425,7 @@ def recursion_fused_finest_254_all(level, app):
             # wp.synchronize()
             # self.bc_mask.export_vti(f"mask_before.vti", "u")
 
-            self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
+            self.neon_container["SFV_reset_bc_mask"](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             wp.synchronize()
             # self.bc_mask.update_host(0)
             # wp.synchronize()
@@ -440,7 +440,7 @@ def recursion_fused_finest_254_all(level, app):
 
             num_levels = self.f_0.get_grid().num_levels
             for l in range(num_levels):
-                self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](l, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
+                self.neon_container["SFV_reset_bc_mask"](l, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             # wp.synchronize()
             # self.bc_mask.update_host(0)
             wp.synchronize()
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 240da307..f2140080 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -26,7 +26,10 @@
 )
 from xlb.operator.boundary_condition.helper_functions_bc import MultiresEncodeAuxiliaryData
 
-
+"""
+SFV = Simple Fluid Voxel: a fluid voxel that is neither a BC voxel nor involved in explosion or coalescence
+CFV = Complex Fluid Voxel: any fluid voxel that is not an SFV
+"""
 class MultiresIncompressibleNavierStokesStepper(Stepper):
     def __init__(
         self,
@@ -147,15 +150,6 @@ def loading(loader: neon.Loader):
 
                 @wp.func
                 def compute(index: Any):
-                    # _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    # if _boundary_id == wp.uint8(255):
-                    #     return
-                    # for l in range(self.velocity_set.q):
-                    #     val = wp.neon_read(coalescence_factor_pn, index, l)
-                    #     if val > 0:
-                    #         val = self.compute_dtype(1) / val
-                    #     wp.neon_write(coalescence_factor_pn, index, l, val)
-                    #####
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
@@ -184,13 +178,6 @@ def compute(index: Any):
                             # HERE -> I have a finer ngh. in direction pull (opposite l)
                             # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # if l == 10:
-                                #     wp.print(accumulated)
-                                #     glob = wp.neon_global_idx(f_1_pn, index)
-                                #     wp.neon_cuda_info()
-                                #     wp.neon_print(glob)
-                                #     wp.neon_level(f_1_pn)
-                                # accumulated = _w[l]
                                 # Full State
                                 # YES finer ngh. in the pull direction (opposite of l)
                                 # YES ngh. at the same level
@@ -198,9 +185,6 @@ def compute(index: Any):
                                 if coalescence_factor > self.compute_dtype(0):
                                     coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor)
                                     wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
-                            #
-                            # else:
-                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                 loader.declare_kernel(compute)
 
@@ -450,8 +434,8 @@ def device(index: Any):
 
             return ll_collide_coarse
 
-        @neon.Container.factory(name="collide_coarse_254")
-        def collide_coarse_254(
+        @neon.Container.factory(name="SFV_collide_coarse")
+        def SFV_collide_coarse(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
@@ -460,16 +444,15 @@ def collide_coarse_254(
             omega: Any,
             timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().num_levels
-
+            """
+            This container executes the collision operator only on the SFVs (bc_mask == 254) of the given level.
+            """
             def ll_collide_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
-                # fake loading to enforce sequential step
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
@@ -479,11 +462,6 @@ def ll_collide_coarse(loader: neon.Loader):
                 @wp.func
                 def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    """
-                    The c++ version starts with the following, which I am not sure is right:
-                        if (type(cell, 0) == CellType::bulk ) {
-                    BC type cells should do collide too
-                    """
                     if _boundary_id != wp.uint8(254):
                         return
 
@@ -502,8 +480,8 @@ def device(index: Any):
 
             return ll_collide_coarse
 
-        @neon.Container.factory(name="no_254_collide_coarse")
-        def collide_coarse_no_254(
+        @neon.Container.factory(name="CFV_collide_coarse")
+        def CFV_collide_coarse(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
@@ -695,8 +673,8 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="no_254_stream_coarse_step_ABC")
-        def stream_coarse_step_ABC_no_254(
+        @neon.Container.factory(name="SFV_stream_coarse_step_ABC")
+        def SFV_stream_coarse_step_ABC(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
@@ -705,15 +683,6 @@ def stream_coarse_step_ABC_no_254(
             omega: Any,
             timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().num_levels
-
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            # od_or_even = wp.module("odd_or_even", "even")
-
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -814,28 +783,21 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="reset_bc_mask_for_no_mr_no_bc_as_254")
-        def reset_bc_mask_for_no_mr_no_bc_as_254(
+        @neon.Container.factory(name="SFV_reset_bc_mask")
+        def SFV_reset_bc_mask(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
             bc_mask_fd: Any,
             missing_mask_fd: Any,
         ):
-            num_levels = f_0_fd.get_grid().num_levels
-
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            # od_or_even = wp.module("odd_or_even", "even")
-
+            """
+            Mark every SFV of the given level by writing 254 into its bc_mask entry.
+            """
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
@@ -869,7 +831,7 @@ def cl_stream_coarse(index: Any):
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                        wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                         # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
@@ -880,7 +842,7 @@ def cl_stream_coarse(index: Any):
                                 if wp.neon_has_parent(f_0_pn, index):
                                     # YES halo cell on top of us
                                     has_a_coarser_ngh = wp.bool(False)
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                    wp.neon_lbm_read_coarser_ngh(
                                         f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
                                     if has_a_coarser_ngh:
@@ -890,33 +852,22 @@ def cl_stream_coarse(index: Any):
                                         # YES ghost cell on top of us
                                         # YES courser ngh.
                                         # -> **Explosion**
-                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
                                         return
                         else:
                             # HERE -> I have a finer ngh. in direction pull (opposite l)
                             # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # if l == 10:
-                                #     wp.print(accumulated)
-                                #     glob = wp.neon_global_idx(f_1_pn, index)
-                                #     wp.neon_cuda_info()
-                                #     wp.neon_print(glob)
-                                #     wp.neon_level(f_1_pn)
-                                # accumulated = _w[l]
                                 # Full State
                                 # YES finer ngh. in the pull direction (opposite of l)
                                 # YES ngh. at the same level
                                 # -> **Coalescence**
                                 return
-                            else:
-                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                     # Only fluid voxels with the following properties can reach this line:
                     # They are not BC voxels
                     # They are not on a resolution jump -> they do not do coalescence or explosion
                     # They are not mr halo cells
                     wp.neon_write(bc_mask_pn, index, 0, wp.uint8(254))
-
                 loader.declare_kernel(cl_stream_coarse)
 
             return ll_stream_coarse
@@ -931,11 +882,6 @@ def stream_coarse_step_A(
             omega: Any,
             timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
-
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
 
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -973,8 +919,8 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse_step_254")
-        def stream_coarse_step_254(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any):
+        @neon.Container.factory(name="SFV_stream_coarse_step")
+        def SFV_stream_coarse_step(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any):
 
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
@@ -992,13 +938,11 @@ def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(254):
                         return
+                    # Voxels tagged 254 in bc_mask are SFVs, which means:
+                    #  - they are not BC voxels
+                    #  - they are not on a resolution jump, so they do no coalescence or explosion
+                    #  - they are not multi-resolution (mr) halo cells
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
-                        return
-
-                    # do stream normally
                     _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
@@ -1006,7 +950,6 @@ def cl_stream_coarse(index: Any):
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    # wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -1079,13 +1022,6 @@ def cl_stream_coarse(index: Any):
                             # HERE -> I have a finer ngh. in direction pull (opposite l)
                             # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # if l == 10:
-                                #     wp.print(accumulated)
-                                #     glob = wp.neon_global_idx(f_1_pn, index)
-                                #     wp.neon_cuda_info()
-                                #     wp.neon_print(glob)
-                                #     wp.neon_level(f_1_pn)
-                                # accumulated = _w[l]
                                 # Full State
                                 # YES finer ngh. in the pull direction (opposite of l)
                                 # YES ngh. at the same level
@@ -1093,9 +1029,6 @@ def cl_stream_coarse(index: Any):
                                 coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                                 accumulated = accumulated * coalescence_factor
                                 wp.neon_write(f_1_pn, index, l, accumulated)
-                            #
-                            # else:
-                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -1118,13 +1051,6 @@ def finest_fused_pull(
             grid = f_0_fd.get_grid()
             num_levels = grid.num_levels
 
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            # od_or_even = wp.module("odd_or_even", "even")
-
             def finest_fused_pull_launcher(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -1224,8 +1150,8 @@ def finest_fused_pull_kernel(index: Any):
 
             return finest_fused_pull_launcher
 
-        @neon.Container.factory(name="finest_fused_pull_no_254")
-        def finest_fused_pull_no_254(
+        @neon.Container.factory(name="CFV_finest_fused_pull")
+        def CFV_finest_fused_pull(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
@@ -1241,13 +1167,6 @@ def finest_fused_pull_no_254(
             grid = f_0_fd.get_grid()
             num_levels = grid.num_levels
 
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            # od_or_even = wp.module("odd_or_even", "even")
-
             def finest_fused_pull_launcher(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -1315,7 +1234,6 @@ def finest_fused_pull_kernel(index: Any):
                                     # YES ghost cell on top of us
                                     # YES courser ngh.
                                     # -> **Explosion**
-                                    # wp.neon_write(f_1_pn, index, l, exploded_pop)
                                     _f_post_stream[l] = exploded_pop
 
                     # do non mres post-streaming corrections
@@ -1350,7 +1268,7 @@ def finest_fused_pull_kernel(index: Any):
             return finest_fused_pull_launcher
 
         @neon.Container.factory(name="254_finest_fused_pull")
-        def finest_fused_pull_254(
+        def SFV_finest_fused_pull(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
@@ -1358,12 +1276,6 @@ def finest_fused_pull_254(
             missing_mask_fd: Any,
             omega: Any,
         ):
-            if level != 0:
-                # throw an exception
-                raise Exception("Only the finest level is supported for now")
-            grid = f_0_fd.get_grid()
-            num_levels = grid.num_levels
-
             if level != 0:
                 # throw an exception
                 raise Exception("Only the finest level is supported for now")
@@ -1461,13 +1373,13 @@ def cl_stream_coarse(index: Any):
             "stream_coarse_step_B": stream_coarse_step_B,
             "stream_coarse_step_C": stream_coarse_step_C,
             "finest_fused_pull": finest_fused_pull,
-            "finest_fused_pull_no_254": finest_fused_pull_no_254,
-            "finest_fused_pull_254": finest_fused_pull_254,
-            "reset_bc_mask_for_no_mr_no_bc_as_254": reset_bc_mask_for_no_mr_no_bc_as_254,
-            "collide_coarse_no_254": collide_coarse_no_254,
-            "collide_coarse_254": collide_coarse_254,
-            "stream_coarse_step_ABC_no_254": stream_coarse_step_ABC_no_254,
-            "stream_coarse_step_254": stream_coarse_step_254,
+            "CFV_finest_fused_pull": CFV_finest_fused_pull,
+            "SFV_finest_fused_pull": SFV_finest_fused_pull,
+            "SFV_reset_bc_mask": SFV_reset_bc_mask,
+            "CFV_collide_coarse": CFV_collide_coarse,
+            "SFV_collide_coarse": SFV_collide_coarse,
+            "SFV_stream_coarse_step_ABC": SFV_stream_coarse_step_ABC,
+            "SFV_stream_coarse_step": SFV_stream_coarse_step,
         }
 
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):

From fe4c1d2778c898eea43315be41d5e141bb9d2c6c Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 9 Mar 2026 12:26:09 -0400
Subject: [PATCH 196/208] Fixed a bug left from previous merge PR

---
 xlb/operator/boundary_masker/indices_boundary_masker.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 4255bf67..343cc22d 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -284,7 +284,7 @@ def kernel_interior_missing_mask(
         }
         return functional_dict, kernel_dict
 
-    def _prepare_kernel_inputs(self, bclist, grid_shape):
+    def _prepare_kernel_inputs(self, bclist, grid_shape, start_index=None):
         """
         Prepare the inputs for the warp kernel by pre-allocating arrays and filling them with boundary condition information.
         """
@@ -371,7 +371,7 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         bc_interior = self._find_bclist_interior(bclist, grid_shape)
 
         # Prepare the first kernel inputs for all items in boundary condition list
-        wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
+        wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape, start_index)
 
         # Launch the warp kernel
         wp.launch(
@@ -523,7 +523,7 @@ def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         bc_interior = self._find_bclist_interior(bclist, grid_shape)
 
         # Prepare the first kernel inputs for all items in boundary condition list
-        wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape)
+        wp_bc_indices, wp_id_numbers, wp_is_interior = self._prepare_kernel_inputs(bclist, grid_shape, start_index)
 
         # Launch the first container
         container_domain_bounds = self.neon_container["container_domain_bounds"](

From 2f66dc6a716c4b086e7e93b427f3669a8989c630 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 9 Mar 2026 15:26:10 -0400
Subject: [PATCH 197/208] The second moment computation includes "rho" embedded
 in its output. So no need to further multiply by rho in KBC.

---
 xlb/operator/collision/kbc.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xlb/operator/collision/kbc.py b/xlb/operator/collision/kbc.py
index c6528a3d..a782d690 100644
--- a/xlb/operator/collision/kbc.py
+++ b/xlb/operator/collision/kbc.py
@@ -64,10 +64,10 @@ def jax_implementation(
         fneq = f - feq
         if isinstance(self.velocity_set, D2Q9):
             shear = self.decompose_shear_d2q9_jax(fneq)
-            delta_s = shear * rho / 4.0
+            delta_s = shear / 4.0
         elif isinstance(self.velocity_set, D3Q27):
             shear = self.decompose_shear_d3q27_jax(fneq)
-            delta_s = shear * rho
+            delta_s = shear
         else:
             raise NotImplementedError("Velocity set not supported: {}".format(type(self.velocity_set)))
 
@@ -277,10 +277,10 @@ def functional(
             fneq = f - feq
             if wp.static(self.velocity_set.d == 3):
                 shear = decompose_shear_d3q27(fneq)
-                delta_s = shear * rho
+                delta_s = shear
             else:
                 shear = decompose_shear_d2q9(fneq)
-                delta_s = shear * rho / self.compute_dtype(4.0)
+                delta_s = shear / self.compute_dtype(4.0)
 
             # Compute required constants based on the input omega (omega is the inverse relaxation time)
             _beta = self.compute_dtype(0.5) * self.compute_dtype(omega)

From 3e55bbd618f92d909884b5a3978bfac8ae69f2fb Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 9 Mar 2026 15:50:04 -0400
Subject: [PATCH 198/208] Removed unused parameters (rho, u) from various
 collision methods, simplifying the function signatures across multiple
 classes.

---
 .../collision/test_bgk_collision_jax.py       |  2 +-
 .../collision/test_bgk_collision_warp.py      |  2 +-
 xlb/operator/collision/bgk.py                 | 12 ++++------
 xlb/operator/collision/forced_collision.py    | 24 +++++++------------
 xlb/operator/collision/kbc.py                 | 20 ++--------------
 xlb/operator/collision/smagorinsky_les_bgk.py | 16 +++----------
 xlb/operator/stepper/nse_multires_stepper.py  | 12 +++++-----
 xlb/operator/stepper/nse_stepper.py           |  8 +++----
 8 files changed, 29 insertions(+), 67 deletions(-)

diff --git a/tests/kernels/collision/test_bgk_collision_jax.py b/tests/kernels/collision/test_bgk_collision_jax.py
index f3f4308f..91143932 100644
--- a/tests/kernels/collision/test_bgk_collision_jax.py
+++ b/tests/kernels/collision/test_bgk_collision_jax.py
@@ -45,7 +45,7 @@ def test_bgk_ollision(dim, velocity_set, grid_shape, omega):
 
     f_orig = my_grid.create_field(cardinality=DefaultConfig.velocity_set.q)
 
-    f_out = compute_collision(f_orig, f_eq, rho, u, omega)
+    f_out = compute_collision(f_orig, f_eq, omega)
 
     assert jnp.allclose(f_out, f_orig - omega * (f_orig - f_eq))
 
diff --git a/tests/kernels/collision/test_bgk_collision_warp.py b/tests/kernels/collision/test_bgk_collision_warp.py
index aa51ea1d..fa6884b2 100644
--- a/tests/kernels/collision/test_bgk_collision_warp.py
+++ b/tests/kernels/collision/test_bgk_collision_warp.py
@@ -44,7 +44,7 @@ def test_bgk_collision_warp(dim, velocity_set, grid_shape, omega):
     f_orig = my_grid.create_field(cardinality=DefaultConfig.velocity_set.q)
 
     f_out = my_grid.create_field(cardinality=DefaultConfig.velocity_set.q)
-    f_out = compute_collision(f_orig, f_eq, f_out, rho, u, omega)
+    f_out = compute_collision(f_orig, f_eq, f_out, omega)
 
     f_eq = f_eq.numpy()
     f_out = f_out.numpy()
diff --git a/xlb/operator/collision/bgk.py b/xlb/operator/collision/bgk.py
index 65d47598..d9a7fb95 100644
--- a/xlb/operator/collision/bgk.py
+++ b/xlb/operator/collision/bgk.py
@@ -16,7 +16,7 @@ class BGK(Collision):
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0,))
-    def jax_implementation(self, f: jnp.ndarray, feq: jnp.ndarray, rho, u, omega):
+    def jax_implementation(self, f: jnp.ndarray, feq: jnp.ndarray, omega):
         fneq = f - feq
         fout = f - self.compute_dtype(omega) * fneq
         return fout
@@ -28,7 +28,7 @@ def _construct_warp(self):
 
         # Construct the functional
         @wp.func
-        def functional(f: Any, feq: Any, rho: Any, u: Any, omega: Any):
+        def functional(f: Any, feq: Any, omega: Any):
             fneq = f - feq
             fout = f - self.compute_dtype(omega) * fneq
             return fout
@@ -39,8 +39,6 @@ def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
-            rho: wp.array4d(dtype=Any),
-            u: wp.array4d(dtype=Any),
             omega: Any,
         ):
             # Get the global index
@@ -55,7 +53,7 @@ def kernel(
                 _feq[l] = feq[l, index[0], index[1], index[2]]
 
             # Compute the collision
-            _fout = functional(_f, _feq, rho, u, omega)
+            _fout = functional(_f, _feq, omega)
 
             # Write the result
             for l in range(self.velocity_set.q):
@@ -68,7 +66,7 @@ def _construct_neon(self):
         return functional, None
 
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f, feq, fout, rho, u, omega):
+    def warp_implementation(self, f, feq, fout, omega):
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
@@ -76,8 +74,6 @@ def warp_implementation(self, f, feq, fout, rho, u, omega):
                 f,
                 feq,
                 fout,
-                rho,
-                u,
                 omega,
             ],
             dim=f.shape[1:],
diff --git a/xlb/operator/collision/forced_collision.py b/xlb/operator/collision/forced_collision.py
index 80c9b0b2..4d97f2c4 100644
--- a/xlb/operator/collision/forced_collision.py
+++ b/xlb/operator/collision/forced_collision.py
@@ -33,9 +33,9 @@ def __init__(
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0,))
-    def jax_implementation(self, f: jnp.ndarray, feq: jnp.ndarray, rho, u, omega):
-        fout = self.collision_operator(f, feq, rho, u, omega)
-        fout = self.forcing_operator(fout, feq, rho, u)
+    def jax_implementation(self, f: jnp.ndarray, feq: jnp.ndarray, omega):
+        fout = self.collision_operator(f, feq, omega)
+        fout = self.forcing_operator(fout, feq)
         return fout
 
     def _construct_warp(self):
@@ -45,9 +45,9 @@ def _construct_warp(self):
 
         # Construct the functional
         @wp.func
-        def functional(f: Any, feq: Any, rho: Any, u: Any, omega: Any):
-            fout = self.collision_operator.warp_functional(f, feq, rho, u, omega)
-            fout = self.forcing_operator.warp_functional(fout, feq, rho, u)
+        def functional(f: Any, feq: Any, omega: Any):
+            fout = self.collision_operator.warp_functional(f, feq, omega)
+            fout = self.forcing_operator.warp_functional(fout, feq)
             return fout
 
         # Construct the warp kernel
@@ -56,8 +56,6 @@ def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
-            rho: wp.array4d(dtype=Any),
-            u: wp.array4d(dtype=Any),
             omega: Any,
         ):
             # Get the global index
@@ -71,13 +69,9 @@ def kernel(
             for l in range(self.velocity_set.q):
                 _f[l] = f[l, index[0], index[1], index[2]]
                 _feq[l] = feq[l, index[0], index[1], index[2]]
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1], index[2]]
-            _rho = rho[0, index[0], index[1], index[2]]
 
             # Compute the collision
-            _fout = functional(_f, _feq, _rho, _u, omega)
+            _fout = functional(_f, _feq, omega)
 
             # Write the result
             for l in range(self.velocity_set.q):
@@ -86,7 +80,7 @@ def kernel(
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f, feq, fout, rho, u, omega):
+    def warp_implementation(self, f, feq, fout, omega):
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
@@ -94,8 +88,6 @@ def warp_implementation(self, f, feq, fout, rho, u, omega):
                 f,
                 feq,
                 fout,
-                rho,
-                u,
                 omega,
             ],
             dim=f.shape[1:],
diff --git a/xlb/operator/collision/kbc.py b/xlb/operator/collision/kbc.py
index a782d690..d814a7cf 100644
--- a/xlb/operator/collision/kbc.py
+++ b/xlb/operator/collision/kbc.py
@@ -43,8 +43,6 @@ def jax_implementation(
         self,
         f: jnp.ndarray,
         feq: jnp.ndarray,
-        rho: jnp.ndarray,
-        u: jnp.ndarray,
         omega,
     ):
         """
@@ -56,10 +54,6 @@ def jax_implementation(
             Distribution function.
         feq : jax.numpy.array
             Equilibrium distribution function.
-        rho : jax.numpy.array
-            Density.
-        u : jax.numpy.array
-            Velocity.
         """
         fneq = f - feq
         if isinstance(self.velocity_set, D2Q9):
@@ -269,8 +263,6 @@ def compute_entropic_scalar_products(
         def functional(
             f: Any,
             feq: Any,
-            rho: Any,
-            u: Any,
             omega: Any,
         ):
             # Compute shear and delta_s
@@ -301,8 +293,6 @@ def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
-            rho: wp.array4d(dtype=Any),
-            u: wp.array4d(dtype=Any),
             omega: Any,
         ):
             # Get the global index
@@ -316,13 +306,9 @@ def kernel(
             for l in range(self.velocity_set.q):
                 _f[l] = f[l, index[0], index[1], index[2]]
                 _feq[l] = feq[l, index[0], index[1], index[2]]
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1], index[2]]
-            _rho = rho[0, index[0], index[1], index[2]]
 
             # Compute the collision
-            _fout = functional(_f, _feq, _rho, _u, omega)
+            _fout = functional(_f, _feq, omega)
 
             # Write the result
             for l in range(self.velocity_set.q):
@@ -338,7 +324,7 @@ def _construct_neon(self):
         return functional, None
 
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f, feq, fout, rho, u, omega):
+    def warp_implementation(self, f, feq, fout, omega):
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
@@ -346,8 +332,6 @@ def warp_implementation(self, f, feq, fout, rho, u, omega):
                 f,
                 feq,
                 fout,
-                rho,
-                u,
                 omega,
             ],
             dim=f.shape[1:],
diff --git a/xlb/operator/collision/smagorinsky_les_bgk.py b/xlb/operator/collision/smagorinsky_les_bgk.py
index b56aeeba..4dbd001e 100644
--- a/xlb/operator/collision/smagorinsky_les_bgk.py
+++ b/xlb/operator/collision/smagorinsky_les_bgk.py
@@ -28,7 +28,7 @@ def __init__(
 
     @Operator.register_backend(ComputeBackend.JAX)
     @partial(jit, static_argnums=(0,))
-    def jax_implementation(self, f: jnp.ndarray, feq: jnp.ndarray, rho: jnp.ndarray, u: jnp.ndarray, omega):
+    def jax_implementation(self, f: jnp.ndarray, feq: jnp.ndarray, omega):
         fneq = f - feq
 
         pi_neq = jnp.tensordot(self.velocity_set.cc, fneq, axes=(0, 0))
@@ -65,8 +65,6 @@ def _construct_warp(self):
         def functional(
             f: Any,
             feq: Any,
-            rho: Any,
-            u: Any,
             omega: Any,
         ):
             # Compute the non-equilibrium distribution
@@ -127,8 +125,6 @@ def functional(
         def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
-            rho: wp.array4d(dtype=Any),
-            u: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
             omega: wp.float32,
         ):
@@ -142,13 +138,9 @@ def kernel(
             for l in range(self.velocity_set.q):
                 _f[l] = f[l, index[0], index[1], index[2]]
                 _feq[l] = feq[l, index[0], index[1], index[2]]
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1], index[2]]
-            _rho = rho[0, index[0], index[1], index[2]]
 
             # Compute the collision
-            _fout = functional(_f, _feq, _rho, _u, omega)
+            _fout = functional(_f, _feq, omega)
 
             # Write the result
             for l in range(self.velocity_set.q):
@@ -157,15 +149,13 @@ def kernel(
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f, feq, rho, u, fout, omega):
+    def warp_implementation(self, f, feq, fout, omega):
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
             inputs=[
                 f,
                 feq,
-                rho,
-                u,
                 fout,
                 omega,
             ],
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index b54c140f..f2e9490e 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -408,7 +408,7 @@ def device(index: Any):
 
                         _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                         _feq = self.equilibrium.neon_functional(_rho, _u)
-                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                         # Apply post-collision boundary conditions
                         _f_post_collision = apply_bc(
@@ -474,7 +474,7 @@ def device(index: Any):
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
@@ -532,7 +532,7 @@ def device(index: Any):
 
                         _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                         _feq = self.equilibrium.neon_functional(_rho, _u)
-                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                         # Apply post-collision boundary conditions
                         _f_post_collision = apply_bc(
@@ -1125,7 +1125,7 @@ def finest_fused_pull_kernel(index: Any):
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                     # Apply post-collision boundary conditions
                     _f_post_collision = apply_bc(
@@ -1242,7 +1242,7 @@ def finest_fused_pull_kernel(index: Any):
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                     # Apply post-collision boundary conditions
                     _f_post_collision = apply_bc(
@@ -1307,7 +1307,7 @@ def finest_fused_pull_kernel_254(index: Any):
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 3e7dbd26..f2685ba4 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -234,7 +234,7 @@ def jax_implementation_pull(self, f_0, f_1, bc_mask, missing_mask, omega, timest
         feq = self.equilibrium(rho, u)
 
         # Apply collision
-        f_post_collision = self.collision(f_post_stream, feq, rho, u, omega)
+        f_post_collision = self.collision(f_post_stream, feq, omega)
 
         # Apply collision type boundary conditions
         for bc in self.boundary_conditions:
@@ -271,7 +271,7 @@ def jax_implementation_push(self, f_0, f_1, bc_mask, missing_mask, omega, timest
         feq = self.equilibrium(rho, u)
 
         # Apply collision
-        f_post_collision = self.collision(f_post_stream, feq, rho, u, omega)
+        f_post_collision = self.collision(f_post_stream, feq, omega)
 
         # Apply collision type boundary conditions
         for bc in self.boundary_conditions:
@@ -422,7 +422,7 @@ def kernel(
 
             _rho, _u = self.macroscopic.warp_functional(_f_post_stream)
             _feq = self.equilibrium.warp_functional(_rho, _u)
-            _f_post_collision = self.collision.warp_functional(_f_post_stream, _feq, _rho, _u, omega)
+            _f_post_collision = self.collision.warp_functional(_f_post_stream, _feq, omega)
 
             # Apply post-collision boundary conditions
             _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_stream, _f_post_collision, False)
@@ -578,7 +578,7 @@ def nse_stepper_cl(index: Any):
 
                     _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
                     _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                     # Apply post-collision boundary conditions
                     _f_post_collision = apply_bc(

From e668765618145ed9bb884390dfe1713bbcd7d542 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Mon, 9 Mar 2026 15:57:08 -0400
Subject: [PATCH 199/208] Updated wall BC in this example to ensure numerical
 stability

---
 examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
index e70f30ca..baaa72e3 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
@@ -186,7 +186,7 @@ def bc_profile_warp(index: wp.vec3i):
 # bc_left = HybridBC(bc_method="bounceback_regularized", profile=bc_profile(), indices=inlet)
 # Alternatively, use a prescribed velocity profile
 # bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
-bc_walls = HalfwayBounceBackBC(indices=walls)
+bc_walls = FullwayBounceBackBC(indices=walls)
 # bc_ground = FullwayBounceBackBC(indices=grid.boundary_indices_across_levels(level_data, box_side="front"))
 # bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)

From df3d5d63b3d90987c8e1a5b30950539558edadbe Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hsalehipour@gmail.com>
Date: Tue, 10 Mar 2026 12:30:16 -0400
Subject: [PATCH 200/208] Clean up (#34)

* Fixed some runtime bugs

* fixed some naming/spelling errors

* removed some debugging comments.

* Introduced a new file `cell_type.py` containing boundary-mask constants for fluid voxels, replacing hardcoded values with the new constants.

* Renamed 254 to SFV in function names
---
 examples/cfd/grid_refinement/ahmed.py         | 1132 ++++++++---------
 examples/cfd/rotating_sphere_3d.py            |  634 ++++-----
 .../collision/test_bgk_collision_jax.py       |    2 +-
 xlb/cell_type.py                              |   11 +
 xlb/helper/simulation_manager.py              |   97 +-
 xlb/mres_perf_optimization_type.py            |    4 +-
 .../boundary_condition/bc_do_nothing.py       |    2 +-
 .../bc_extrapolation_outflow.py               |    2 +-
 .../bc_halfway_bounce_back.py                 |    2 +-
 xlb/operator/boundary_condition/bc_hybrid.py  |    4 +-
 xlb/operator/boundary_condition/bc_zouhe.py   |    2 +-
 .../boundary_condition/boundary_condition.py  |    2 +-
 .../boundary_condition/helper_functions_bc.py |    4 +-
 xlb/operator/boundary_masker/aabb.py          |    9 +-
 xlb/operator/boundary_masker/aabb_close.py    |   19 +-
 .../indices_boundary_masker.py                |    9 +-
 .../boundary_masker/multires_aabb_close.py    |    9 +-
 xlb/operator/boundary_masker/winding.py       |    3 +-
 xlb/operator/equilibrium/__init__.py          |    2 +-
 ...m.py => multires_quadratic_equilibrium.py} |    0
 xlb/operator/macroscopic/first_moment.py      |    2 +-
 .../macroscopic/multires_macroscopic.py       |    3 +-
 xlb/operator/macroscopic/second_moment.py     |    2 +-
 xlb/operator/macroscopic/zero_moment.py       |    2 +-
 xlb/operator/stepper/nse_multires_stepper.py  |  270 +---
 xlb/operator/stepper/nse_stepper.py           |    7 +-
 xlb/operator/stream/stream.py                 |    2 +-
 xlb/utils/mesher.py                           |    4 +-
 xlb/utils/utils.py                            |   10 +-
 29 files changed, 1031 insertions(+), 1220 deletions(-)
 create mode 100644 xlb/cell_type.py
 rename xlb/operator/equilibrium/{mulltires_quadratic_equilibrium.py => multires_quadratic_equilibrium.py} (100%)

diff --git a/examples/cfd/grid_refinement/ahmed.py b/examples/cfd/grid_refinement/ahmed.py
index 3b258870..59a633fc 100644
--- a/examples/cfd/grid_refinement/ahmed.py
+++ b/examples/cfd/grid_refinement/ahmed.py
@@ -1,566 +1,566 @@
-import neon
-import warp as wp
-import numpy as np
-import time
-import os
-import matplotlib.pyplot as plt
-import trimesh
-import shutil
-
-import xlb
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import multires_grid_factory
-from xlb.operator.boundary_condition import (
-    DoNothingBC,
-    HybridBC,
-    RegularizedBC,
-)
-from xlb.operator.boundary_masker import MeshVoxelizationMethod
-from xlb.utils.mesher import prepare_sparsity_pattern, make_cuboid_mesh, MultiresIO
-from xlb.utils import UnitConvertor
-from xlb.operator.force import MultiresMomentumTransfer
-from xlb.helper.initializers import CustomMultiresInitializer
-
-wp.clear_kernel_cache()
-wp.config.quiet = True
-
-# User Configuration
-# =================
-# Physical and simulation parameters
-wind_speed_lbm = 0.05  # Lattice velocity
-wind_speed_mps = 38.0  # Physical inlet velocity in m/s (user input)
-flow_passes = 2  # Domain flow passes
-kinematic_viscosity = 1.508e-5  # Kinematic viscosity of air in m^2/s 1.508e-5
-voxel_size = 0.005  # Finest voxel size in meters
-
-# STL filename
-stl_filename = "examples/cfd/stl-files/Ahmed_25_NoLegs.stl"
-script_name = "Ahmed"
-
-# I/O settings
-print_interval_percentage = 1  # Print every 1% of iterations
-file_output_crossover_percentage = 10  # Crossover at 50% of iterations
-num_file_outputs_pre_crossover = 20  # Outputs before crossover
-num_file_outputs_post_crossover = 5  # Outputs after crossover
-
-# Other setup parameters
-compute_backend = ComputeBackend.NEON
-precision_policy = PrecisionPolicy.FP32FP32
-velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
-
-
-def generate_cuboid_mesh(stl_filename, voxel_size):
-    """
-    Alternative cuboid mesh generation based on Apolo's method with domain multipliers per level.
-    """
-    # Domain multipliers for each refinement level
-    domain_multiplier = [
-        [3.0, 4.0, 2.5, 2.5, 0.0, 4.0],  # -x, x, -y, y, -z, z
-        [1.2, 1.25, 1.75, 1.75, 0.0, 1.5],
-        [0.8, 1.0, 1.25, 1.25, 0.0, 1.2],
-        [0.5, 0.65, 0.6, 0.60, 0.0, 0.6],
-        [0.25, 0.25, 0.25, 0.25, 0.0, 0.25],
-    ]
-
-    # Load the mesh
-    mesh = trimesh.load_mesh(stl_filename, process=False)
-    if mesh.is_empty:
-        raise ValueError("Loaded mesh is empty or invalid.")
-
-    # Compute original bounds
-    min_bound = mesh.vertices.min(axis=0)
-    max_bound = mesh.vertices.max(axis=0)
-    partSize = max_bound - min_bound
-    x0 = max_bound[0]  # End of car for Ahmed
-
-    # Compute translation to put mesh into first octant of the domain
-    stl_shift = np.array(
-        [
-            domain_multiplier[0][0] * partSize[0] - min_bound[0],
-            domain_multiplier[0][2] * partSize[1] - min_bound[1],
-            domain_multiplier[0][4] * partSize[2] - min_bound[2],
-        ],
-        dtype=float,
-    )
-
-    # Apply translation and save out temp STL
-    mesh.apply_translation(stl_shift)
-    _ = mesh.vertex_normals
-    mesh_vertices = np.asarray(mesh.vertices)
-    mesh.export("temp.stl")
-
-    # Generate mesh using make_cuboid_mesh
-    level_data = make_cuboid_mesh(
-        voxel_size,
-        domain_multiplier,
-        "temp.stl",
-    )
-
-    num_levels = len(level_data)
-    grid_shape_finest = tuple([int(i * 2 ** (num_levels - 1)) for i in level_data[-1][0].shape])
-    print(f"Full shape based on finest voxel size is {grid_shape_finest}")
-    os.remove("temp.stl")
-
-    return (
-        level_data,
-        mesh_vertices,
-        tuple([int(a) for a in grid_shape_finest]),
-        stl_shift,
-        x0,
-    )
-
-
-# Boundary Conditions Setup
-# =========================
-def setup_boundary_conditions(grid, level_data, body_vertices, wind_speed_mps):
-    """
-    Set up boundary conditions for the simulation.
-    """
-    # Convert wind speed to lattice units
-    wind_speed_lbm = unit_convertor.velocity_to_lbm(wind_speed_mps)
-
-    left_indices = grid.boundary_indices_across_levels(level_data, box_side="left", remove_edges=True)
-    right_indices = grid.boundary_indices_across_levels(level_data, box_side="right", remove_edges=True)
-    top_indices = grid.boundary_indices_across_levels(level_data, box_side="top", remove_edges=False)
-    bottom_indices = grid.boundary_indices_across_levels(level_data, box_side="bottom", remove_edges=False)
-    front_indices = grid.boundary_indices_across_levels(level_data, box_side="front", remove_edges=False)
-    back_indices = grid.boundary_indices_across_levels(level_data, box_side="back", remove_edges=False)
-
-    # Initialize boundary conditions
-    bc_inlet = RegularizedBC("velocity", prescribed_value=(wind_speed_lbm, 0.0, 0.0), indices=left_indices)
-    bc_outlet = DoNothingBC(indices=right_indices)
-    bc_top = HybridBC(bc_method="nonequilibrium_regularized", indices=top_indices)
-    bc_bottom = HybridBC(bc_method="nonequilibrium_regularized", indices=bottom_indices)
-    bc_front = HybridBC(bc_method="nonequilibrium_regularized", indices=front_indices)
-    bc_back = HybridBC(bc_method="nonequilibrium_regularized", indices=back_indices)
-    bc_body = HybridBC(
-        bc_method="nonequilibrium_regularized",
-        mesh_vertices=unit_convertor.length_to_lbm(body_vertices),
-        voxelization_method=MeshVoxelizationMethod("AABB_CLOSE", close_voxels=4),
-        use_mesh_distance=True,
-    )
-
-    return [bc_top, bc_bottom, bc_front, bc_back, bc_inlet, bc_outlet, bc_body]
-
-
-# Simulation Initialization
-# =========================
-def initialize_simulation(
-    grid, boundary_conditions, omega_finest, initializer, collision_type="KBC", mres_perf_opt=xlb.MresPerfOptimizationType.FUSION_AT_FINEST
-):
-    """
-    Initialize the multiresolution simulation manager.
-    """
-    sim = xlb.helper.MultiresSimulationManager(
-        omega_finest=omega_finest,
-        grid=grid,
-        boundary_conditions=boundary_conditions,
-        collision_type=collision_type,
-        initializer=initializer,
-        mres_perf_opt=mres_perf_opt,
-    )
-    return sim
-
-
-# Utility Functions
-# =================
-def print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area):
-    """
-    Calculate and print lift and drag coefficients.
-    """
-    boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
-    drag = boundary_force[0]
-    lift = boundary_force[2]
-    cd = 2.0 * drag / (wind_speed_lbm**2 * reference_area)
-    cl = 2.0 * lift / (wind_speed_lbm**2 * reference_area)
-    if np.isnan(cd) or np.isnan(cl):
-        print(f"NaN detected in coefficients at step {step}")
-        raise ValueError(f"NaN detected in coefficients at step {step}: Cd={cd}, Cl={cl}")
-    drag_values.append([cd, cl])
-    return cd, cl, drag
-
-
-def plot_drag_lift(drag_values, output_dir, print_interval, script_name, percentile_range=(15, 85), use_log_scale=False):
-    """
-    Plot CD and CL over time and save the plot to the output directory.
-    """
-    drag_values_array = np.array(drag_values)
-    steps = np.arange(0, len(drag_values) * print_interval, print_interval)
-    cd_values = drag_values_array[:, 0]
-    cl_values = drag_values_array[:, 1]
-    y_min = min(np.percentile(cd_values, percentile_range[0]), np.percentile(cl_values, percentile_range[0]))
-    y_max = max(np.percentile(cd_values, percentile_range[1]), np.percentile(cl_values, percentile_range[1]))
-    padding = (y_max - y_min) * 0.1
-    y_min, y_max = y_min - padding, y_max + padding
-    if use_log_scale:
-        y_min = max(y_min, 1e-6)
-    plt.figure(figsize=(10, 6))
-    plt.plot(steps, cd_values, label="Drag Coefficient (Cd)", color="blue")
-    plt.plot(steps, cl_values, label="Lift Coefficient (Cl)", color="red")
-    plt.xlabel("Simulation Step")
-    plt.ylabel("Coefficient")
-    plt.title(f"{script_name}: Drag and Lift Coefficients Over Time")
-    plt.legend()
-    plt.grid(True)
-    plt.ylim(y_min, y_max)
-    if use_log_scale:
-        plt.yscale("log")
-    plt.savefig(os.path.join(output_dir, "drag_lift_plot.png"))
-    plt.close()
-
-
-def compute_voxel_statistics(sim, bc_mask_exporter, sparsity_pattern, boundary_conditions, unit_convertor):
-    """
-    Compute active/solid voxels, totals, lattice updates, and reference area based on simulation data.
-    """
-    fields_data = bc_mask_exporter.get_fields_data({"bc_mask": sim.bc_mask})
-    bc_mask_data = fields_data["bc_mask_0"]
-    level_id_field = bc_mask_exporter.level_id_field
-
-    # Compute solid voxels per level (assuming 255 is the solid marker)
-    solid_voxels = []
-    for lvl in range(num_levels):
-        level_mask = level_id_field == lvl
-        solid_voxels.append(np.sum(bc_mask_data[level_mask] == 255))
-
-    # Compute active voxels (total non-zero in sparsity minus solids)
-    active_voxels = [np.count_nonzero(mask) for mask in sparsity_pattern]
-    active_voxels = [max(0, active_voxels[lvl] - solid_voxels[lvl]) for lvl in range(num_levels)]
-
-    # Totals
-    total_voxels = sum(active_voxels)
-    total_lattice_updates_per_step = sum(active_voxels[lvl] * (2 ** (num_levels - 1 - lvl)) for lvl in range(num_levels))
-
-    # Compute reference area (projected on YZ plane at finest level)
-    finest_level = 0
-    mask_finest = level_id_field == finest_level
-    bc_mask_finest = bc_mask_data[mask_finest]
-    active_indices_finest = np.argwhere(sparsity_pattern[0])
-    bc_body_id = boundary_conditions[-1].id  # Assuming last BC is bc_body
-    solid_voxels_indices = active_indices_finest[bc_mask_finest == bc_body_id]
-    unique_jk = np.unique(solid_voxels_indices[:, 1:3], axis=0)
-    reference_area = unique_jk.shape[0]
-    reference_area_physical = reference_area * unit_convertor.reference_length**2
-
-    return {
-        "active_voxels": active_voxels,
-        "solid_voxels": solid_voxels,
-        "total_voxels": total_voxels,
-        "total_lattice_updates_per_step": total_lattice_updates_per_step,
-        "reference_area": reference_area,
-        "reference_area_physical": reference_area_physical,
-    }
-
-
-def plot_data(x0, output_dir, delta_x_coarse, sim, IOexporter, prefix="Ahmed"):
-    """
-    Ahmed Car Model, slant - angle = 25 degree
-    Profiles on symmetry plane (y=0) covering entire field
-    Origin of coordinate system:
-         x=0: end of the car, y=0: symmetry plane, z=0: ground plane
-
-    S.Becker/H. Lienhart/C.Stoots
-    Insitute of Fluid Mechanics
-    University Erlangen-Nuremberg
-    Erlangen, Germany
-    Coordaintes in meters need to convert to voxels
-    Velocity data in m/s
-    """
-
-    def _load_sim_line(csv_path):
-        """
-        Read a CSV exported by IOexporter.to_line without pandas.
-        Returns (z, Ux).
-        """
-        # Read with header as column names
-        data = np.genfromtxt(
-            csv_path,
-            delimiter=",",
-            names=True,
-            autostrip=True,
-            dtype=None,
-            encoding="utf-8",
-        )
-        if data.size == 0:
-            raise ValueError(f"No data in {csv_path}")
-
-        z = np.asarray(data["z"], dtype=float)
-        ux = np.asarray(data["value"], dtype=float)
-        return z, ux
-
-    # Load reference data
-    import json
-
-    ref_data_path = "examples/cfd/data/ahmed.json"
-    with open(ref_data_path, "r") as file:
-        data = json.load(file)
-
-    for x_str in data["data"].keys():
-        # Extract reference horizontal velocity in m/s and its corresponding height in m
-        refX = np.array(data["data"][x_str]["x-velocity"])
-        refY = np.array(data["data"][x_str]["height"])
-
-        # From reference x0 (rear of body) find x1 for plot
-        x_pos = float(x_str)
-        x1 = x0 + x_pos
-
-        print(f" x1 is {x1}")
-        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
-        filename = os.path.join(output_dir, f"{prefix}_{x_str}")
-        wp.synchronize()
-        IOexporter.to_line(
-            filename,
-            {"velocity": sim.u},
-            start_point=(x1, 0, 0),
-            end_point=(x1, 0, 0.8),
-            resolution=250,
-            component=0,
-            radius=delta_x_coarse,  # needed with model units
-        )
-        # read the CSV written by the exporter
-        csv_path = filename + "_velocity_0.csv"
-        print(f"CSV path is {csv_path}")
-
-        try:
-            sim_z, sim_ux = _load_sim_line(csv_path)
-        except Exception as e:
-            print(f"Failed to read {csv_path}: {e}")
-            continue
-
-        # plot reference vs simulation
-        plt.figure(figsize=(4.5, 6))
-        plt.plot(refX, refY, "o", mfc="none", label="Experimental)")
-        plt.plot(sim_ux, sim_z, "-", lw=2, label="Simulation")
-        plt.xlim(np.min(refX) * 0.9, np.max(refX) * 1.1)
-        plt.ylim(np.min(refY), np.max(refY))
-        plt.xlabel("Ux [m/s]")
-        plt.ylabel("z [m]")
-        plt.title(f"Velocity Plot at {x_pos:+.3f}")
-        plt.grid(True, alpha=0.3)
-        plt.legend()
-        plt.tight_layout()
-        plt.savefig(filename + ".png", dpi=150)
-        plt.close()
-
-
-# Main Script
-# ===========
-# Initialize XLB
-
-xlb.init(
-    velocity_set=velocity_set,
-    default_backend=compute_backend,
-    default_precision_policy=precision_policy,
-)
-
-# Generate mesh
-level_data, body_vertices, grid_shape_zip, stl_shift, x0 = generate_cuboid_mesh(stl_filename, voxel_size)
-
-# Prepare the sparsity pattern and origins from the level data
-sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
-
-# Define a unit convertor
-unit_convertor = UnitConvertor(
-    velocity_lbm_unit=wind_speed_lbm,
-    velocity_physical_unit=wind_speed_mps,
-    voxel_size_physical_unit=voxel_size,
-)
-
-# Calculate lattice parameters
-num_levels = len(level_data)
-delta_x_coarse = voxel_size * 2 ** (num_levels - 1)
-nu_lattice = unit_convertor.viscosity_to_lbm(kinematic_viscosity)
-omega_finest = 1.0 / (3.0 * nu_lattice + 0.5)
-
-# Create output directory
-current_dir = os.path.join(os.path.dirname(__file__))
-output_dir = os.path.join(current_dir, script_name)
-if os.path.exists(output_dir):
-    shutil.rmtree(output_dir)
-os.makedirs(output_dir)
-
-# Define exporter objects
-field_name_cardinality_dict = {"velocity": 3, "density": 1}
-h5exporter = MultiresIO(
-    field_name_cardinality_dict,
-    level_data,
-    offset=-stl_shift,
-    unit_convertor=unit_convertor,
-)
-bc_mask_exporter = MultiresIO(
-    {"bc_mask": 1},
-    level_data,
-    offset=-stl_shift,
-    unit_convertor=unit_convertor,
-)
-
-# Create grid
-grid = multires_grid_factory(
-    grid_shape_zip,
-    velocity_set=velocity_set,
-    sparsity_pattern_list=sparsity_pattern,
-    sparsity_pattern_origins=[neon.Index_3d(*box_origin) for box_origin in level_origins],
-)
-
-# Calculate num_steps
-coarsest_level = grid.count_levels - 1
-grid_shape_x_coarsest = grid.level_to_shape(coarsest_level)[0]
-num_steps = int(flow_passes * (grid_shape_x_coarsest / wind_speed_lbm))
-
-# Calculate print and file output intervals
-print_interval = max(1, int(num_steps * (print_interval_percentage / 100.0)))
-crossover_step = int(num_steps * (file_output_crossover_percentage / 100.0))
-file_output_interval_pre_crossover = (
-    max(1, int(crossover_step / num_file_outputs_pre_crossover)) if num_file_outputs_pre_crossover > 0 else num_steps + 1
-)
-file_output_interval_post_crossover = (
-    max(1, int((num_steps - crossover_step) / num_file_outputs_post_crossover)) if num_file_outputs_post_crossover > 0 else num_steps + 1
-)
-
-# Setup boundary conditions
-boundary_conditions = setup_boundary_conditions(grid, level_data, body_vertices, wind_speed_mps)
-
-# Create initializer
-wind_speed_lbm = unit_convertor.velocity_to_lbm(wind_speed_mps)
-initializer = CustomMultiresInitializer(
-    bc_id=boundary_conditions[-2].id,  # bc_outlet
-    constant_velocity_vector=(wind_speed_lbm, 0.0, 0.0),
-    velocity_set=velocity_set,
-    precision_policy=precision_policy,
-    compute_backend=compute_backend,
-)
-
-# Initialize simulation
-sim = initialize_simulation(grid, boundary_conditions, omega_finest, initializer)
-
-# Compute voxel statistics and reference area
-stats = compute_voxel_statistics(sim, bc_mask_exporter, sparsity_pattern, boundary_conditions, unit_convertor)
-active_voxels = stats["active_voxels"]
-solid_voxels = stats["solid_voxels"]
-total_voxels = stats["total_voxels"]
-total_lattice_updates_per_step = stats["total_lattice_updates_per_step"]
-reference_area = stats["reference_area"]
-reference_area_physical = stats["reference_area_physical"]
-
-# Save initial bc_mask
-filename = os.path.join(output_dir, f"{script_name}_initial_bc_mask")
-try:
-    bc_mask_exporter.to_hdf5(filename, {"bc_mask": sim.bc_mask}, compression="gzip", compression_opts=0)
-    xmf_filename = f"{filename}.xmf"
-    hdf5_basename = f"{script_name}_initial_bc_mask.h5"
-except Exception as e:
-    print(f"Error during initial bc_mask output: {e}")
-wp.synchronize()
-
-
-# Setup momentum transfer
-momentum_transfer = MultiresMomentumTransfer(
-    boundary_conditions[-1],
-    mres_perf_opt=xlb.MresPerfOptimizationType.FUSION_AT_FINEST,
-    compute_backend=compute_backend,
-)
-
-# Print simulation info
-print("\n" + "=" * 50 + "\n")
-print(f"Number of flow passes: {flow_passes}")
-print(f"Calculated iterations: {num_steps:,}")
-print(f"Finest voxel size: {voxel_size} meters")
-print(f"Coarsest voxel size: {delta_x_coarse} meters")
-print(f"Total voxels: {sum(np.count_nonzero(mask) for mask in sparsity_pattern):,}")
-print(f"Total active voxels: {total_voxels:,}")
-print(f"Active voxels per level: {active_voxels}")
-print(f"Solid voxels per level: {solid_voxels}")
-print(f"Total lattice updates per global step: {total_lattice_updates_per_step:,}")
-print(f"Number of refinement levels: {num_levels}")
-print(f"Physical inlet velocity: {wind_speed_mps:.4f} m/s")
-print(f"Lattice velocity (ulb): {wind_speed_lbm}")
-print(f"Computed reference area (bc_mask): {reference_area} lattice units")
-print(f"Physical reference area (bc_mask): {reference_area_physical:.6f} m^2")
-print("\n" + "=" * 50 + "\n")
-
-# -------------------------- Simulation Loop --------------------------
-wp.synchronize()
-start_time = time.time()
-compute_time = 0.0
-steps_since_last_print = 0
-drag_values = []
-
-for step in range(num_steps):
-    step_start = time.time()
-    sim.step()
-    wp.synchronize()
-    compute_time += time.time() - step_start
-    steps_since_last_print += 1
-    if step % print_interval == 0 or step == num_steps - 1:
-        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
-        wp.synchronize()
-        cd, cl, drag = print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area)
-        filename = os.path.join(output_dir, f"{script_name}_{step:04d}")
-        h5exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=0)
-        h5exporter.to_slice_image(
-            filename,
-            {"velocity": sim.u},
-            plane_point=(1, 0, 0),
-            plane_normal=(0, 1, 0),
-            grid_res=2000,
-            bounds=(0.25, 0.75, 0, 0.5),
-            show_axes=False,
-            show_colorbar=False,
-            slice_thickness=delta_x_coarse,  # needed when using model units
-        )
-        end_time = time.time()
-        elapsed = end_time - start_time
-        total_lattice_updates = total_lattice_updates_per_step * steps_since_last_print
-        MLUPS = total_lattice_updates / compute_time / 1e6 if compute_time > 0 else 0.0
-        current_flow_passes = step * wind_speed_lbm / grid_shape_x_coarsest
-        remaining_steps = num_steps - step - 1
-        time_remaining = 0.0 if MLUPS == 0 else (total_lattice_updates_per_step * remaining_steps) / (MLUPS * 1e6)
-        hours, rem = divmod(time_remaining, 3600)
-        minutes, seconds = divmod(rem, 60)
-        time_remaining_str = f"{int(hours):02d}h {int(minutes):02d}m {int(seconds):02d}s"
-        percent_complete = (step + 1) / num_steps * 100
-        print(f"Completed step {step}/{num_steps} ({percent_complete:.2f}% complete)")
-        print(f"  Flow Passes: {current_flow_passes:.2f}")
-        print(f"  Time elapsed: {elapsed:.1f}s, Compute time: {compute_time:.1f}s, ETA: {time_remaining_str}")
-        print(f"  MLUPS: {MLUPS:.1f}")
-        print(f"  Cd={cd:.3f}, Cl={cl:.3f}, Drag Force (lattice units)={drag:.3f}")
-        start_time = time.time()
-        compute_time = 0.0
-        steps_since_last_print = 0
-    file_output_interval = file_output_interval_pre_crossover if step < crossover_step else file_output_interval_post_crossover
-    if step % file_output_interval == 0 or step == num_steps - 1:
-        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
-        filename = os.path.join(output_dir, f"{script_name}_{step:04d}")
-        try:
-            h5exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=0)
-            xmf_filename = f"{filename}.xmf"
-            hdf5_basename = f"{script_name}_{step:04d}.h5"
-        except Exception as e:
-            print(f"Error during file output at step {step}: {e}")
-        wp.synchronize()
-    if step == num_steps - 1:
-        plot_data(x0, output_dir, delta_x_coarse, sim, h5exporter, prefix="Ahmed")
-
-# Save drag and lift data to CSV
-if len(drag_values) > 0:
-    with open(os.path.join(output_dir, "drag_lift.csv"), "w") as fd:
-        fd.write("Step,Cd,Cl\n")
-        for i, (cd, cl) in enumerate(drag_values):
-            fd.write(f"{i * print_interval},{cd},{cl}\n")
-    plot_drag_lift(drag_values, output_dir, print_interval, script_name)
-
-# Calculate and print average Cd and Cl for the last 50%
-drag_values_array = np.array(drag_values)
-if len(drag_values) > 0:
-    start_index = len(drag_values) // 2
-    last_half = drag_values_array[start_index:, :]
-    avg_cd = np.mean(last_half[:, 0])
-    avg_cl = np.mean(last_half[:, 1])
-    print(f"Average Drag Coefficient (Cd) for last 50%: {avg_cd:.6f}")
-    print(f"Average Lift Coefficient (Cl) for last 50%: {avg_cl:.6f}")
-    print(f"Experimental Drag Coefficient (Cd): {0.3088}")
-    print(f"Error Drag Coefficient (Cd): {((avg_cd - 0.3088) / 0.3088) * 100:.2f}%")
-
-else:
-    print("No drag or lift data collected.")
+import neon
+import warp as wp
+import numpy as np
+import time
+import os
+import matplotlib.pyplot as plt
+import trimesh
+import shutil
+
+import xlb
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import multires_grid_factory
+from xlb.operator.boundary_condition import (
+    DoNothingBC,
+    HybridBC,
+    RegularizedBC,
+)
+from xlb.operator.boundary_masker import MeshVoxelizationMethod
+from xlb.utils.mesher import prepare_sparsity_pattern, make_cuboid_mesh, MultiresIO
+from xlb.utils import UnitConvertor
+from xlb.operator.force import MultiresMomentumTransfer
+from xlb.helper.initializers import CustomMultiresInitializer
+
+wp.clear_kernel_cache()
+wp.config.quiet = True
+
+# User Configuration
+# =================
+# Physical and simulation parameters
+wind_speed_lbm = 0.05  # Lattice velocity
+wind_speed_mps = 38.0  # Physical inlet velocity in m/s (user input)
+flow_passes = 2  # Domain flow passes
+kinematic_viscosity = 1.508e-5  # Kinematic viscosity of air in m^2/s 1.508e-5
+voxel_size = 0.005  # Finest voxel size in meters
+
+# STL filename
+stl_filename = "examples/cfd/stl-files/Ahmed_25_NoLegs.stl"
+script_name = "Ahmed"
+
+# I/O settings
+print_interval_percentage = 1  # Print every 1% of iterations
+file_output_crossover_percentage = 10  # Crossover at 10% of iterations
+num_file_outputs_pre_crossover = 20  # Outputs before crossover
+num_file_outputs_post_crossover = 5  # Outputs after crossover
+
+# Other setup parameters
+compute_backend = ComputeBackend.NEON
+precision_policy = PrecisionPolicy.FP32FP32
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+
+
+def generate_cuboid_mesh(stl_filename, voxel_size):
+    """
+    Alternative cuboid mesh generation based on Apolo's method with domain multipliers per level.
+    """
+    # Domain multipliers for each refinement level
+    domain_multiplier = [
+        [3.0, 4.0, 2.5, 2.5, 0.0, 4.0],  # -x, x, -y, y, -z, z
+        [1.2, 1.25, 1.75, 1.75, 0.0, 1.5],
+        [0.8, 1.0, 1.25, 1.25, 0.0, 1.2],
+        [0.5, 0.65, 0.6, 0.60, 0.0, 0.6],
+        [0.25, 0.25, 0.25, 0.25, 0.0, 0.25],
+    ]
+
+    # Load the mesh
+    mesh = trimesh.load_mesh(stl_filename, process=False)
+    if mesh.is_empty:
+        raise ValueError("Loaded mesh is empty or invalid.")
+
+    # Compute original bounds
+    min_bound = mesh.vertices.min(axis=0)
+    max_bound = mesh.vertices.max(axis=0)
+    partSize = max_bound - min_bound
+    x0 = max_bound[0]  # End of car for Ahmed
+
+    # Compute translation to put mesh into first octant of the domain
+    stl_shift = np.array(
+        [
+            domain_multiplier[0][0] * partSize[0] - min_bound[0],
+            domain_multiplier[0][2] * partSize[1] - min_bound[1],
+            domain_multiplier[0][4] * partSize[2] - min_bound[2],
+        ],
+        dtype=float,
+    )
+
+    # Apply translation and save out temp STL
+    mesh.apply_translation(stl_shift)
+    _ = mesh.vertex_normals
+    mesh_vertices = np.asarray(mesh.vertices)
+    mesh.export("temp.stl")
+
+    # Generate mesh using make_cuboid_mesh
+    level_data = make_cuboid_mesh(
+        voxel_size,
+        domain_multiplier,
+        "temp.stl",
+    )
+
+    num_levels = len(level_data)
+    grid_shape_finest = tuple([int(i * 2 ** (num_levels - 1)) for i in level_data[-1][0].shape])
+    print(f"Full shape based on finest voxel size is {grid_shape_finest}")
+    os.remove("temp.stl")
+
+    return (
+        level_data,
+        mesh_vertices,
+        tuple([int(a) for a in grid_shape_finest]),
+        stl_shift,
+        x0,
+    )
+
+
+# Boundary Conditions Setup
+# =========================
+def setup_boundary_conditions(grid, level_data, body_vertices, wind_speed_mps):
+    """
+    Set up boundary conditions for the simulation.
+    """
+    # Convert wind speed to lattice units
+    wind_speed_lbm = unit_convertor.velocity_to_lbm(wind_speed_mps)
+
+    left_indices = grid.boundary_indices_across_levels(level_data, box_side="left", remove_edges=True)
+    right_indices = grid.boundary_indices_across_levels(level_data, box_side="right", remove_edges=True)
+    top_indices = grid.boundary_indices_across_levels(level_data, box_side="top", remove_edges=False)
+    bottom_indices = grid.boundary_indices_across_levels(level_data, box_side="bottom", remove_edges=False)
+    front_indices = grid.boundary_indices_across_levels(level_data, box_side="front", remove_edges=False)
+    back_indices = grid.boundary_indices_across_levels(level_data, box_side="back", remove_edges=False)
+
+    # Initialize boundary conditions
+    bc_inlet = RegularizedBC("velocity", prescribed_value=(wind_speed_lbm, 0.0, 0.0), indices=left_indices)
+    bc_outlet = DoNothingBC(indices=right_indices)
+    bc_top = HybridBC(bc_method="nonequilibrium_regularized", indices=top_indices)
+    bc_bottom = HybridBC(bc_method="nonequilibrium_regularized", indices=bottom_indices)
+    bc_front = HybridBC(bc_method="nonequilibrium_regularized", indices=front_indices)
+    bc_back = HybridBC(bc_method="nonequilibrium_regularized", indices=back_indices)
+    bc_body = HybridBC(
+        bc_method="nonequilibrium_regularized",
+        mesh_vertices=unit_convertor.length_to_lbm(body_vertices),
+        voxelization_method=MeshVoxelizationMethod("AABB_CLOSE", close_voxels=4),
+        use_mesh_distance=True,
+    )
+
+    return [bc_top, bc_bottom, bc_front, bc_back, bc_inlet, bc_outlet, bc_body]
+
+
+# Simulation Initialization
+# =========================
+def initialize_simulation(
+    grid, boundary_conditions, omega_finest, initializer, collision_type="KBC", mres_perf_opt=xlb.MresPerfOptimizationType.FUSION_AT_FINEST
+):
+    """
+    Initialize the multiresolution simulation manager.
+    """
+    sim = xlb.helper.MultiresSimulationManager(
+        omega_finest=omega_finest,
+        grid=grid,
+        boundary_conditions=boundary_conditions,
+        collision_type=collision_type,
+        initializer=initializer,
+        mres_perf_opt=mres_perf_opt,
+    )
+    return sim
+
+
+# Utility Functions
+# =================
+def print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area):
+    """
+    Calculate lift and drag coefficients and append them to drag_values (raises on NaN).
+    """
+    boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
+    drag = boundary_force[0]
+    lift = boundary_force[2]
+    cd = 2.0 * drag / (wind_speed_lbm**2 * reference_area)
+    cl = 2.0 * lift / (wind_speed_lbm**2 * reference_area)
+    if np.isnan(cd) or np.isnan(cl):
+        print(f"NaN detected in coefficients at step {step}")
+        raise ValueError(f"NaN detected in coefficients at step {step}: Cd={cd}, Cl={cl}")
+    drag_values.append([cd, cl])
+    return cd, cl, drag
+
+
+def plot_drag_lift(drag_values, output_dir, print_interval, script_name, percentile_range=(15, 85), use_log_scale=False):
+    """
+    Plot CD and CL over time and save the plot to the output directory.
+    """
+    drag_values_array = np.array(drag_values)
+    steps = np.arange(0, len(drag_values) * print_interval, print_interval)
+    cd_values = drag_values_array[:, 0]
+    cl_values = drag_values_array[:, 1]
+    y_min = min(np.percentile(cd_values, percentile_range[0]), np.percentile(cl_values, percentile_range[0]))
+    y_max = max(np.percentile(cd_values, percentile_range[1]), np.percentile(cl_values, percentile_range[1]))
+    padding = (y_max - y_min) * 0.1
+    y_min, y_max = y_min - padding, y_max + padding
+    if use_log_scale:
+        y_min = max(y_min, 1e-6)
+    plt.figure(figsize=(10, 6))
+    plt.plot(steps, cd_values, label="Drag Coefficient (Cd)", color="blue")
+    plt.plot(steps, cl_values, label="Lift Coefficient (Cl)", color="red")
+    plt.xlabel("Simulation Step")
+    plt.ylabel("Coefficient")
+    plt.title(f"{script_name}: Drag and Lift Coefficients Over Time")
+    plt.legend()
+    plt.grid(True)
+    plt.ylim(y_min, y_max)
+    if use_log_scale:
+        plt.yscale("log")
+    plt.savefig(os.path.join(output_dir, "drag_lift_plot.png"))
+    plt.close()
+
+
+def compute_voxel_statistics(sim, bc_mask_exporter, sparsity_pattern, boundary_conditions, unit_convertor):
+    """
+    Compute active/solid voxels, totals, lattice updates, and reference area based on simulation data.
+    """
+    fields_data = bc_mask_exporter.get_fields_data({"bc_mask": sim.bc_mask})
+    bc_mask_data = fields_data["bc_mask_0"]
+    level_id_field = bc_mask_exporter.level_id_field
+    num_levels = len(sparsity_pattern)  # one mask per level; derived locally instead of read from a module-level global
+    # Compute solid voxels per level (assuming 255 is the solid marker)
+    solid_voxels = []
+    for lvl in range(num_levels):
+        level_mask = level_id_field == lvl
+        solid_voxels.append(np.sum(bc_mask_data[level_mask] == 255))
+
+    # Compute active voxels (total non-zero in sparsity minus solids)
+    active_voxels = [np.count_nonzero(mask) for mask in sparsity_pattern]
+    active_voxels = [max(0, active_voxels[lvl] - solid_voxels[lvl]) for lvl in range(num_levels)]  # clamped at zero
+
+    # Totals
+    total_voxels = sum(active_voxels)  # sum of *active* voxels over all levels (solids already removed)
+    total_lattice_updates_per_step = sum(active_voxels[lvl] * (2 ** (num_levels - 1 - lvl)) for lvl in range(num_levels))  # level lvl weighted by 2^(num_levels-1-lvl)
+
+    # Compute reference area (projected on YZ plane at finest level)
+    finest_level = 0
+    mask_finest = level_id_field == finest_level
+    bc_mask_finest = bc_mask_data[mask_finest]
+    active_indices_finest = np.argwhere(sparsity_pattern[0])
+    bc_body_id = boundary_conditions[-1].id  # Assuming last BC is bc_body
+    solid_voxels_indices = active_indices_finest[bc_mask_finest == bc_body_id]  # NOTE(review): assumes exporter data and argwhere share the same voxel order - confirm
+    unique_jk = np.unique(solid_voxels_indices[:, 1:3], axis=0)  # distinct (j, k) pairs, i.e. the x axis is collapsed
+    reference_area = unique_jk.shape[0]  # area in finest-level voxel faces
+    reference_area_physical = reference_area * unit_convertor.reference_length**2
+
+    return {
+        "active_voxels": active_voxels,
+        "solid_voxels": solid_voxels,
+        "total_voxels": total_voxels,
+        "total_lattice_updates_per_step": total_lattice_updates_per_step,
+        "reference_area": reference_area,
+        "reference_area_physical": reference_area_physical,
+    }
+
+
+def plot_data(x0, output_dir, delta_x_coarse, sim, IOexporter, prefix="Ahmed"):
+    """
+    Plot simulated x-velocity profiles against the Ahmed-body reference data, one PNG per reference x-station.
+    Ahmed Car Model, slant - angle = 25 degree
+    Profiles on symmetry plane (y=0) covering entire field
+    Origin of coordinate system:
+         x=0: end of the car, y=0: symmetry plane, z=0: ground plane
+
+    S.Becker/H. Lienhart/C.Stoots
+    Institute of Fluid Mechanics
+    University Erlangen-Nuremberg, Erlangen, Germany
+    Coordinates in meters need to be converted to voxels
+    Velocity data in m/s
+    """
+
+    def _load_sim_line(csv_path):
+        """
+        Read a CSV exported by IOexporter.to_line without pandas.
+        Returns (z, Ux).
+        """
+        # Read with header as column names
+        data = np.genfromtxt(
+            csv_path,
+            delimiter=",",
+            names=True,
+            autostrip=True,
+            dtype=None,
+            encoding="utf-8",
+        )
+        if data.size == 0:
+            raise ValueError(f"No data in {csv_path}")
+
+        z = np.asarray(data["z"], dtype=float)
+        ux = np.asarray(data["value"], dtype=float)
+        return z, ux
+
+    # Load reference data
+    import json
+
+    ref_data_path = "examples/cfd/data/ahmed.json"  # relative path: resolved against the current working directory
+    with open(ref_data_path, "r") as file:
+        data = json.load(file)
+
+    for x_str in data["data"].keys():
+        # Extract reference horizontal velocity in m/s and its corresponding height in m
+        refX = np.array(data["data"][x_str]["x-velocity"])
+        refY = np.array(data["data"][x_str]["height"])
+
+        # From reference x0 (rear of body) find x1 for plot
+        x_pos = float(x_str)
+        x1 = x0 + x_pos
+
+        print(f" x1 is {x1}")
+        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
+        filename = os.path.join(output_dir, f"{prefix}_{x_str}")
+        wp.synchronize()
+        IOexporter.to_line(
+            filename,
+            {"velocity": sim.u},
+            start_point=(x1, 0, 0),
+            end_point=(x1, 0, 0.8),
+            resolution=250,
+            component=0,
+            radius=delta_x_coarse,  # needed with model units
+        )
+        # read the CSV written by the exporter
+        csv_path = filename + "_velocity_0.csv"
+        print(f"CSV path is {csv_path}")
+
+        try:
+            sim_z, sim_ux = _load_sim_line(csv_path)
+        except Exception as e:  # best effort: a missing or unreadable profile skips this station only
+            print(f"Failed to read {csv_path}: {e}")
+            continue
+
+        # plot reference vs simulation
+        plt.figure(figsize=(4.5, 6))
+        plt.plot(refX, refY, "o", mfc="none", label="Experimental")  # fixed: label had a stray ")"
+        plt.plot(sim_ux, sim_z, "-", lw=2, label="Simulation")
+        plt.xlim(np.min(refX) * 0.9, np.max(refX) * 1.1)  # NOTE(review): scaling shrinks the range when min(refX) < 0 - confirm intended
+        plt.ylim(np.min(refY), np.max(refY))
+        plt.xlabel("Ux [m/s]")
+        plt.ylabel("z [m]")
+        plt.title(f"Velocity Plot at {x_pos:+.3f}")
+        plt.grid(True, alpha=0.3)
+        plt.legend()
+        plt.tight_layout()
+        plt.savefig(filename + ".png", dpi=150)
+        plt.close()
+
+
+# Main Script
+# ===========
+# Initialize XLB
+
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+# Generate mesh
+level_data, body_vertices, grid_shape_zip, stl_shift, x0 = generate_cuboid_mesh(stl_filename, voxel_size)
+
+# Prepare the sparsity pattern and origins from the level data
+sparsity_pattern, level_origins = prepare_sparsity_pattern(level_data)
+
+# Define a unit convertor
+unit_convertor = UnitConvertor(
+    velocity_lbm_unit=wind_speed_lbm,
+    velocity_physical_unit=wind_speed_mps,
+    voxel_size_physical_unit=voxel_size,
+)
+
+# Calculate lattice parameters
+num_levels = len(level_data)
+delta_x_coarse = voxel_size * 2 ** (num_levels - 1)  # voxel size doubles with every level above the finest
+nu_lattice = unit_convertor.viscosity_to_lbm(kinematic_viscosity)
+omega_finest = 1.0 / (3.0 * nu_lattice + 0.5)  # relaxation rate computed from the lattice viscosity
+
+# Create output directory
+current_dir = os.path.join(os.path.dirname(__file__))  # join() with a single argument returns it unchanged
+output_dir = os.path.join(current_dir, script_name)
+if os.path.exists(output_dir):  # start clean: results from a previous run are deleted
+    shutil.rmtree(output_dir)
+os.makedirs(output_dir)
+
+# Define exporter objects
+field_name_cardinality_dict = {"velocity": 3, "density": 1}
+h5exporter = MultiresIO(
+    field_name_cardinality_dict,
+    level_data,
+    offset=-stl_shift,
+    unit_convertor=unit_convertor,
+)
+bc_mask_exporter = MultiresIO(
+    {"bc_mask": 1},
+    level_data,
+    offset=-stl_shift,
+    unit_convertor=unit_convertor,
+)
+
+# Create grid
+grid = multires_grid_factory(
+    grid_shape_zip,
+    velocity_set=velocity_set,
+    sparsity_pattern_list=sparsity_pattern,
+    sparsity_pattern_origins=[neon.Index_3d(*box_origin) for box_origin in level_origins],
+)
+
+# Calculate num_steps
+coarsest_level = grid.count_levels - 1
+grid_shape_x_coarsest = grid.level_to_shape(coarsest_level)[0]
+num_steps = int(flow_passes * (grid_shape_x_coarsest / wind_speed_lbm))  # flow_passes times (coarsest x extent / lattice speed)
+
+# Calculate print and file output intervals
+print_interval = max(1, int(num_steps * (print_interval_percentage / 100.0)))
+crossover_step = int(num_steps * (file_output_crossover_percentage / 100.0))
+file_output_interval_pre_crossover = (  # num_steps + 1 disables periodic output; step 0 and the last step still match the loop test
+    max(1, int(crossover_step / num_file_outputs_pre_crossover)) if num_file_outputs_pre_crossover > 0 else num_steps + 1
+)
+file_output_interval_post_crossover = (
+    max(1, int((num_steps - crossover_step) / num_file_outputs_post_crossover)) if num_file_outputs_post_crossover > 0 else num_steps + 1
+)
+
+# Setup boundary conditions
+boundary_conditions = setup_boundary_conditions(grid, level_data, body_vertices, wind_speed_mps)
+
+# Create initializer
+wind_speed_lbm = unit_convertor.velocity_to_lbm(wind_speed_mps)  # NOTE(review): wind_speed_lbm was already used above (UnitConvertor, num_steps); this reassignment looks redundant - confirm
+initializer = CustomMultiresInitializer(
+    bc_id=boundary_conditions[-2].id,  # bc_outlet
+    constant_velocity_vector=(wind_speed_lbm, 0.0, 0.0),
+    velocity_set=velocity_set,
+    precision_policy=precision_policy,
+    compute_backend=compute_backend,
+)
+
+# Initialize simulation
+sim = initialize_simulation(grid, boundary_conditions, omega_finest, initializer)
+
+# Compute voxel statistics and reference area
+stats = compute_voxel_statistics(sim, bc_mask_exporter, sparsity_pattern, boundary_conditions, unit_convertor)
+active_voxels = stats["active_voxels"]
+solid_voxels = stats["solid_voxels"]
+total_voxels = stats["total_voxels"]
+total_lattice_updates_per_step = stats["total_lattice_updates_per_step"]
+reference_area = stats["reference_area"]
+reference_area_physical = stats["reference_area_physical"]
+
+# Save initial bc_mask
+filename = os.path.join(output_dir, f"{script_name}_initial_bc_mask")
+try:
+    bc_mask_exporter.to_hdf5(filename, {"bc_mask": sim.bc_mask}, compression="gzip", compression_opts=0)
+    xmf_filename = f"{filename}.xmf"  # not read anywhere in the visible script
+    hdf5_basename = f"{script_name}_initial_bc_mask.h5"  # not read anywhere in the visible script
+except Exception as e:  # best effort: a failed mask export is reported and the run continues
+    print(f"Error during initial bc_mask output: {e}")
+wp.synchronize()
+
+
+# Setup momentum transfer
+momentum_transfer = MultiresMomentumTransfer(
+    boundary_conditions[-1],  # same "last BC is the body" convention used by compute_voxel_statistics
+    mres_perf_opt=xlb.MresPerfOptimizationType.FUSION_AT_FINEST,
+    compute_backend=compute_backend,
+)
+
+# Print simulation info
+print("\n" + "=" * 50 + "\n")
+print(f"Number of flow passes: {flow_passes}")
+print(f"Calculated iterations: {num_steps:,}")
+print(f"Finest voxel size: {voxel_size} meters")
+print(f"Coarsest voxel size: {delta_x_coarse} meters")
+print(f"Total voxels: {sum(np.count_nonzero(mask) for mask in sparsity_pattern):,}")
+print(f"Total active voxels: {total_voxels:,}")
+print(f"Active voxels per level: {active_voxels}")
+print(f"Solid voxels per level: {solid_voxels}")
+print(f"Total lattice updates per global step: {total_lattice_updates_per_step:,}")
+print(f"Number of refinement levels: {num_levels}")
+print(f"Physical inlet velocity: {wind_speed_mps:.4f} m/s")
+print(f"Lattice velocity (ulb): {wind_speed_lbm}")
+print(f"Computed reference area (bc_mask): {reference_area} lattice units")
+print(f"Physical reference area (bc_mask): {reference_area_physical:.6f} m^2")
+print("\n" + "=" * 50 + "\n")
+
+# -------------------------- Simulation Loop --------------------------
+wp.synchronize()
+start_time = time.time()
+compute_time = 0.0  # time spent inside sim.step() since the last report (output time excluded)
+steps_since_last_print = 0
+drag_values = []  # NOTE(review): nothing in this loop appends here; presumably print_lift_drag does - confirm, otherwise the CSV/plot/averages below never run
+
+for step in range(num_steps):
+    step_start = time.time()
+    sim.step()
+    wp.synchronize()  # wait for the device so the timing covers the whole step
+    compute_time += time.time() - step_start
+    steps_since_last_print += 1
+    if step % print_interval == 0 or step == num_steps - 1:
+        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
+        wp.synchronize()
+        cd, cl, drag = print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area)
+        filename = os.path.join(output_dir, f"{script_name}_{step:04d}")
+        h5exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=0)  # same file name as the file-output branch below
+        h5exporter.to_slice_image(
+            filename,
+            {"velocity": sim.u},
+            plane_point=(1, 0, 0),
+            plane_normal=(0, 1, 0),
+            grid_res=2000,
+            bounds=(0.25, 0.75, 0, 0.5),
+            show_axes=False,
+            show_colorbar=False,
+            slice_thickness=delta_x_coarse,  # needed when using model units
+        )
+        end_time = time.time()
+        elapsed = end_time - start_time
+        total_lattice_updates = total_lattice_updates_per_step * steps_since_last_print
+        MLUPS = total_lattice_updates / compute_time / 1e6 if compute_time > 0 else 0.0  # million lattice updates per second since the last report
+        current_flow_passes = step * wind_speed_lbm / grid_shape_x_coarsest
+        remaining_steps = num_steps - step - 1
+        time_remaining = 0.0 if MLUPS == 0 else (total_lattice_updates_per_step * remaining_steps) / (MLUPS * 1e6)  # ETA from the current throughput
+        hours, rem = divmod(time_remaining, 3600)
+        minutes, seconds = divmod(rem, 60)
+        time_remaining_str = f"{int(hours):02d}h {int(minutes):02d}m {int(seconds):02d}s"
+        percent_complete = (step + 1) / num_steps * 100
+        print(f"Completed step {step}/{num_steps} ({percent_complete:.2f}% complete)")
+        print(f"  Flow Passes: {current_flow_passes:.2f}")
+        print(f"  Time elapsed: {elapsed:.1f}s, Compute time: {compute_time:.1f}s, ETA: {time_remaining_str}")
+        print(f"  MLUPS: {MLUPS:.1f}")
+        print(f"  Cd={cd:.3f}, Cl={cl:.3f}, Drag Force (lattice units)={drag:.3f}")
+        start_time = time.time()  # reset the per-report timers and counters
+        compute_time = 0.0
+        steps_since_last_print = 0
+    file_output_interval = file_output_interval_pre_crossover if step < crossover_step else file_output_interval_post_crossover
+    if step % file_output_interval == 0 or step == num_steps - 1:
+        sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
+        filename = os.path.join(output_dir, f"{script_name}_{step:04d}")
+        try:
+            h5exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=0)
+            xmf_filename = f"{filename}.xmf"  # not read anywhere in the visible script
+            hdf5_basename = f"{script_name}_{step:04d}.h5"  # not read anywhere in the visible script
+        except Exception as e:  # best effort: a failed export is reported and the run continues
+            print(f"Error during file output at step {step}: {e}")
+        wp.synchronize()
+    if step == num_steps - 1:  # velocity-profile comparison plots are produced once, after the final step
+        plot_data(x0, output_dir, delta_x_coarse, sim, h5exporter, prefix="Ahmed")
+
+# Save drag and lift data to CSV
+if len(drag_values) > 0:
+    with open(os.path.join(output_dir, "drag_lift.csv"), "w") as fd:
+        fd.write("Step,Cd,Cl\n")
+        for i, (cd, cl) in enumerate(drag_values):
+            fd.write(f"{i * print_interval},{cd},{cl}\n")  # step is reconstructed from the sample index; the final-step sample is generally not a multiple of print_interval
+    plot_drag_lift(drag_values, output_dir, print_interval, script_name)
+
+# Calculate and print average Cd and Cl for the last 50%
+drag_values_array = np.array(drag_values)
+if len(drag_values) > 0:
+    start_index = len(drag_values) // 2  # average over the second half of the recorded samples
+    last_half = drag_values_array[start_index:, :]
+    avg_cd = np.mean(last_half[:, 0])
+    avg_cl = np.mean(last_half[:, 1])
+    print(f"Average Drag Coefficient (Cd) for last 50%: {avg_cd:.6f}")
+    print(f"Average Lift Coefficient (Cl) for last 50%: {avg_cl:.6f}")
+    print(f"Experimental Drag Coefficient (Cd): {0.3088}")  # 0.3088 is the reference Cd the result is compared against
+    print(f"Error Drag Coefficient (Cd): {((avg_cd - 0.3088) / 0.3088) * 100:.2f}%")  # signed relative error in percent
+
+else:
+    print("No drag or lift data collected.")
diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index 450edbbd..235345b3 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -1,317 +1,317 @@
-import xlb
-import trimesh
-import time
-import warp as wp
-import numpy as np
-import jax.numpy as jnp
-from typing import Any
-
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import grid_factory
-from xlb.operator.stepper import IncompressibleNavierStokesStepper
-from xlb.operator.boundary_condition import (
-    HalfwayBounceBackBC,
-    FullwayBounceBackBC,
-    RegularizedBC,
-    DoNothingBC,
-    HybridBC,
-)
-from xlb.operator.force.momentum_transfer import MomentumTransfer
-from xlb.operator.macroscopic import Macroscopic
-from xlb.utils import save_fields_vtk, save_image
-import matplotlib.pyplot as plt
-from xlb.operator.equilibrium import QuadraticEquilibrium
-from xlb.operator import Operator
-from xlb.velocity_set.velocity_set import VelocitySet
-from xlb.operator.boundary_masker import MeshVoxelizationMethod
-
-# -------------------------- Simulation Setup --------------------------
-
-# Grid parameters
-wp.clear_kernel_cache()
-diam = 32
-grid_size_x, grid_size_y, grid_size_z = 10 * diam, 7 * diam, 7 * diam
-grid_shape = (grid_size_x, grid_size_y, grid_size_z)
-
-# Simulation Configuration
-compute_backend = ComputeBackend.WARP
-precision_policy = PrecisionPolicy.FP32FP32
-
-velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
-wind_speed = 0.04
-num_steps = 100000
-print_interval = 1000
-post_process_interval = 1000
-
-# Physical Parameters
-Re = 200.0
-visc = wind_speed * diam / Re
-omega = 1.0 / (3.0 * visc + 0.5)
-
-# Rotational speed parameters (see [1] which discusses the problem in terms of 2 non-dimensional parameters: Re and Omega)
-# [1] J. Fluid Mech. (2016), vol. 807, pp. 62–86. c© Cambridge University Press 2016 doi:10.1017/jfm.2016.596
-# \Omega = \omega * D / (2 U_\infty) where Omega is non-dimensional and omega is dimensional.
-rot_rate_nondim = -0.2
-rot_rate = 2.0 * wind_speed * rot_rate_nondim / diam
-
-# Print simulation info
-print("\n" + "=" * 50 + "\n")
-print("Simulation Configuration:")
-print(f"Grid size: {grid_size_x} x {grid_size_y} x {grid_size_z}")
-print(f"Backend: {compute_backend}")
-print(f"Velocity set: {velocity_set}")
-print(f"Precision policy: {precision_policy}")
-print(f"Prescribed velocity: {wind_speed}")
-print(f"Reynolds number: {Re}")
-print(f"Max iterations: {num_steps}")
-print("\n" + "=" * 50 + "\n")
-
-# Initialize XLB
-xlb.init(
-    velocity_set=velocity_set,
-    default_backend=compute_backend,
-    default_precision_policy=precision_policy,
-)
-
-# Create Grid
-grid = grid_factory(grid_shape, compute_backend=compute_backend)
-
-# Bounding box indices
-box = grid.bounding_box_indices()
-box_no_edge = grid.bounding_box_indices(remove_edges=True)
-inlet = box_no_edge["left"]
-outlet = box["right"]
-walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
-walls = np.unique(np.array(walls), axis=-1).tolist()
-
-# Load the mesh (replace with your own mesh)
-stl_filename = "examples/cfd/stl-files/sphere.stl"
-mesh = trimesh.load_mesh(stl_filename, process=False)
-mesh_vertices = mesh.vertices
-
-# Transform the mesh points to be located in the right position in the wind tunnel
-mesh_vertices -= mesh_vertices.min(axis=0)
-mesh_extents = mesh_vertices.max(axis=0)
-length_phys_unit = mesh_extents.max()
-length_lbm_unit = grid_shape[1] / 7
-dx = length_phys_unit / length_lbm_unit
-mesh_vertices = mesh_vertices / dx
-shift = np.array([grid_shape[0] / 3, (grid_shape[1] - mesh_extents[1] / dx) / 2, (grid_shape[2] - mesh_extents[2] / dx) / 2])
-sphere = mesh_vertices + shift
-diam = np.max(sphere.max(axis=0) - sphere.min(axis=0))
-sphere_cross_section = np.pi * diam**2 / 4.0
-
-
-# Define rotating boundary profile
-def bc_profile():
-    dtype = precision_policy.compute_precision.wp_dtype
-    _u_vec = wp.vec(velocity_set.d, dtype=dtype)
-    angular_velocity = _u_vec(0.0, rot_rate, 0.0)
-    origin_np = shift + diam / 2
-    origin_wp = _u_vec(origin_np[0], origin_np[1], origin_np[2])
-
-    @wp.func
-    def bc_profile_warp(index: wp.vec3i):
-        x = dtype(index[0])
-        y = dtype(index[1])
-        z = dtype(index[2])
-        surface_coord = _u_vec(x, y, z) - origin_wp
-        return wp.cross(angular_velocity, surface_coord)
-
-    return bc_profile_warp
-
-
-# Define boundary conditions
-bc_left = RegularizedBC("velocity", prescribed_value=(wind_speed, 0.0, 0.0), indices=inlet)
-bc_do_nothing = DoNothingBC(indices=outlet)
-# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method="ray", profile=bc_profile())
-bc_sphere = HybridBC(
-    bc_method="nonequilibrium_regularized",
-    mesh_vertices=sphere,
-    use_mesh_distance=True,
-    voxelization_method=MeshVoxelizationMethod("RAY"),
-    profile=bc_profile(),
-)
-# Not assining BC for walls makes them periodic.
-boundary_conditions = [bc_left, bc_do_nothing, bc_sphere]
-
-
-# Setup Stepper
-stepper = IncompressibleNavierStokesStepper(
-    grid=grid,
-    boundary_conditions=boundary_conditions,
-    collision_type="KBC",
-)
-
-# Make initializer operator
-from xlb.helper.initializers import CustomInitializer
-
-initializer = CustomInitializer(
-    bc_id=bc_do_nothing.id,
-    constant_velocity_vector=(wind_speed, 0.0, 0.0),
-    velocity_set=velocity_set,
-    precision_policy=precision_policy,
-    compute_backend=compute_backend,
-)
-
-# Prepare Fields
-f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields(initializer=initializer)
-
-
-# -------------------------- Helper Functions --------------------------
-
-
-def plot_coefficient(time_steps, coefficients, prefix="drag"):
-    """
-    Plot the drag coefficient with various moving averages.
-
-    Args:
-        time_steps (list): List of time steps.
-        coefficients (list): List of force coefficients.
-    """
-    # Convert lists to numpy arrays for processing
-    time_steps_np = np.array(time_steps)
-    coefficients_np = np.array(coefficients)
-
-    # Define moving average windows
-    windows = [10, 100, 1000, 10000, 100000]
-    labels = ["MA 10", "MA 100", "MA 1,000", "MA 10,000", "MA 100,000"]
-
-    plt.figure(figsize=(12, 8))
-    plt.plot(time_steps_np, coefficients_np, label="Raw", alpha=0.5)
-
-    for window, label in zip(windows, labels):
-        if len(coefficients_np) >= window:
-            ma = np.convolve(coefficients_np, np.ones(window) / window, mode="valid")
-            plt.plot(time_steps_np[window - 1 :], ma, label=label)
-
-    plt.ylim(-1.0, 1.0)
-    plt.legend()
-    plt.xlabel("Time step")
-    plt.ylabel("Drag coefficient")
-    plt.title("Drag Coefficient Over Time with Moving Averages")
-    plt.savefig(prefix + "_ma.png")
-    plt.close()
-
-
-def post_process(
-    step,
-    f_0,
-    f_1,
-    grid_shape,
-    macro,
-    momentum_transfer,
-    missing_mask,
-    bc_mask,
-    wind_speed,
-    car_cross_section,
-    drag_coefficients,
-    lift_coefficients,
-    time_steps,
-):
-    """
-    Post-process simulation data: save fields, compute forces, and plot drag coefficient.
-
-    Args:
-        step (int): Current time step.
-        f_current: Current distribution function.
-        grid_shape (tuple): Shape of the grid.
-        macro: Macroscopic operator object.
-        momentum_transfer: MomentumTransfer operator object.
-        missing_mask: Missing mask from stepper.
-        bc_mask: Boundary condition mask from stepper.
-        wind_speed (float): Prescribed wind speed.
-        car_cross_section (float): Cross-sectional area of the car.
-        drag_coefficients (list): List to store drag coefficients.
-        lift_coefficients (list): List to store lift coefficients.
-        time_steps (list): List to store time steps.
-    """
-    wp.synchronize()
-    # Convert to JAX array if necessary
-    if not isinstance(f_0, jnp.ndarray):
-        f_0_jax = wp.to_jax(f_0)
-    else:
-        f_0_jax = f_0
-
-    # Compute macroscopic quantities
-    rho, u = macro(f_0_jax)
-
-    # Remove boundary cells
-    u = u[:, 1:-1, 1:-1, 1:-1]
-    u_magnitude = jnp.sqrt(u[0] ** 2 + u[1] ** 2 + u[2] ** 2)
-
-    fields = {"ux": u[0], "uy": u[1], "uz": u[2], "u_magnitude": u_magnitude}
-
-    # Save fields in VTK format
-    # save_fields_vtk(fields, timestep=step)
-
-    # Save the u_magnitude slice at the mid y-plane
-    mid_y = grid_shape[1] // 2
-    save_image(fields["u_magnitude"][:, mid_y, :], timestep=step)
-
-    # Compute lift and drag
-    boundary_force = momentum_transfer(f_0, f_1, bc_mask, missing_mask)
-    drag = boundary_force[0]  # x-direction
-    lift = boundary_force[2]
-    cd = 2.0 * drag / (wind_speed**2 * car_cross_section)
-    cl = 2.0 * lift / (wind_speed**2 * car_cross_section)
-    print(f"CD={cd}, CL={cl}")
-    drag_coefficients.append(cd)
-    lift_coefficients.append(cl)
-    time_steps.append(step)
-
-    # Plot drag coefficient
-    plot_coefficient(time_steps, drag_coefficients, prefix="drag")
-    plot_coefficient(time_steps, lift_coefficients, prefix="lift")
-
-
-# Setup Momentum Transfer for Force Calculation
-bc_car = boundary_conditions[-1]
-momentum_transfer = MomentumTransfer(bc_car, compute_backend=compute_backend)
-
-# Define Macroscopic Calculation
-macro = Macroscopic(
-    compute_backend=ComputeBackend.JAX,
-    precision_policy=precision_policy,
-    velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
-)
-
-# Initialize Lists to Store Coefficients and Time Steps
-time_steps = []
-drag_coefficients = []
-lift_coefficients = []
-
-# -------------------------- Simulation Loop --------------------------
-
-start_time = time.time()
-for step in range(num_steps):
-    # Perform simulation step
-    f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, step)
-    f_0, f_1 = f_1, f_0  # Swap the buffers
-
-    # Print progress at intervals
-    if step % print_interval == 0:
-        elapsed_time = time.time() - start_time
-        print(f"Iteration: {step}/{num_steps} | Time elapsed: {elapsed_time:.2f}s")
-        start_time = time.time()
-
-    # Post-process at intervals and final step
-    if (step % post_process_interval == 0) or (step == num_steps - 1):
-        post_process(
-            step,
-            f_0,
-            f_1,
-            grid_shape,
-            macro,
-            momentum_transfer,
-            missing_mask,
-            bc_mask,
-            wind_speed,
-            sphere_cross_section,
-            drag_coefficients,
-            lift_coefficients,
-            time_steps,
-        )
-
-print("Simulation completed successfully.")
+import xlb
+import trimesh
+import time
+import warp as wp
+import numpy as np
+import jax.numpy as jnp
+from typing import Any
+
+from xlb.compute_backend import ComputeBackend
+from xlb.precision_policy import PrecisionPolicy
+from xlb.grid import grid_factory
+from xlb.operator.stepper import IncompressibleNavierStokesStepper
+from xlb.operator.boundary_condition import (
+    HalfwayBounceBackBC,
+    FullwayBounceBackBC,
+    RegularizedBC,
+    DoNothingBC,
+    HybridBC,
+)
+from xlb.operator.force.momentum_transfer import MomentumTransfer
+from xlb.operator.macroscopic import Macroscopic
+from xlb.utils import save_fields_vtk, save_image
+import matplotlib.pyplot as plt
+from xlb.operator.equilibrium import QuadraticEquilibrium
+from xlb.operator import Operator
+from xlb.velocity_set.velocity_set import VelocitySet
+from xlb.operator.boundary_masker import MeshVoxelizationMethod
+
+# -------------------------- Simulation Setup --------------------------
+
+# Grid parameters
+wp.clear_kernel_cache()
+diam = 32  # nominal sphere diameter in lattice cells; reassigned from the mesh further down
+grid_size_x, grid_size_y, grid_size_z = 10 * diam, 7 * diam, 7 * diam
+grid_shape = (grid_size_x, grid_size_y, grid_size_z)
+
+# Simulation Configuration
+compute_backend = ComputeBackend.WARP
+precision_policy = PrecisionPolicy.FP32FP32
+
+velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
+wind_speed = 0.04
+num_steps = 100000
+print_interval = 1000
+post_process_interval = 1000
+
+# Physical Parameters
+Re = 200.0
+visc = wind_speed * diam / Re  # uses the nominal diam = 32 set above
+omega = 1.0 / (3.0 * visc + 0.5)
+
+# Rotational speed parameters (see [1] which discusses the problem in terms of 2 non-dimensional parameters: Re and Omega)
+# [1] J. Fluid Mech. (2016), vol. 807, pp. 62–86. © Cambridge University Press 2016, doi:10.1017/jfm.2016.596
+# \Omega = \omega * D / (2 U_\infty) where Omega is non-dimensional and omega is dimensional.
+rot_rate_nondim = -0.2
+rot_rate = 2.0 * wind_speed * rot_rate_nondim / diam  # the relation above solved for the dimensional omega
+
+# Print simulation info
+print("\n" + "=" * 50 + "\n")
+print("Simulation Configuration:")
+print(f"Grid size: {grid_size_x} x {grid_size_y} x {grid_size_z}")
+print(f"Backend: {compute_backend}")
+print(f"Velocity set: {velocity_set}")
+print(f"Precision policy: {precision_policy}")
+print(f"Prescribed velocity: {wind_speed}")
+print(f"Reynolds number: {Re}")
+print(f"Max iterations: {num_steps}")
+print("\n" + "=" * 50 + "\n")
+
+# Initialize XLB
+xlb.init(
+    velocity_set=velocity_set,
+    default_backend=compute_backend,
+    default_precision_policy=precision_policy,
+)
+
+# Create Grid
+grid = grid_factory(grid_shape, compute_backend=compute_backend)
+
+# Bounding box indices
+box = grid.bounding_box_indices()
+box_no_edge = grid.bounding_box_indices(remove_edges=True)
+inlet = box_no_edge["left"]
+outlet = box["right"]
+walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
+walls = np.unique(np.array(walls), axis=-1).tolist()  # not passed to any boundary condition defined later in this script
+
+# Load the mesh (replace with your own mesh)
+stl_filename = "examples/cfd/stl-files/sphere.stl"
+mesh = trimesh.load_mesh(stl_filename, process=False)
+mesh_vertices = mesh.vertices
+
+# Transform the mesh points to be located in the right position in the wind tunnel
+mesh_vertices -= mesh_vertices.min(axis=0)  # move the minimum corner of the mesh to the origin
+mesh_extents = mesh_vertices.max(axis=0)
+length_phys_unit = mesh_extents.max()
+length_lbm_unit = grid_shape[1] / 7  # target size of the largest extent in lattice cells
+dx = length_phys_unit / length_lbm_unit
+mesh_vertices = mesh_vertices / dx
+shift = np.array([grid_shape[0] / 3, (grid_shape[1] - mesh_extents[1] / dx) / 2, (grid_shape[2] - mesh_extents[2] / dx) / 2])  # one third along x, centred in y and z
+sphere = mesh_vertices + shift
+diam = np.max(sphere.max(axis=0) - sphere.min(axis=0))  # replaces the nominal diam with the largest extent of the placed mesh
+sphere_cross_section = np.pi * diam**2 / 4.0
+
+
+# Define rotating boundary profile
+def bc_profile():  # builds and returns the Warp function that gives the wall velocity at a lattice index
+    dtype = precision_policy.compute_precision.wp_dtype
+    _u_vec = wp.vec(velocity_set.d, dtype=dtype)
+    angular_velocity = _u_vec(0.0, rot_rate, 0.0)  # rotation about the y axis
+    origin_np = shift + diam / 2  # rotation centre: placed mesh minimum corner (shift) plus half the diameter
+    origin_wp = _u_vec(origin_np[0], origin_np[1], origin_np[2])
+
+    @wp.func
+    def bc_profile_warp(index: wp.vec3i):
+        x = dtype(index[0])
+        y = dtype(index[1])
+        z = dtype(index[2])
+        surface_coord = _u_vec(x, y, z) - origin_wp  # position of the index relative to the rotation centre
+        return wp.cross(angular_velocity, surface_coord)  # omega x r
+
+    return bc_profile_warp
+
+
+# Define boundary conditions
+bc_left = RegularizedBC("velocity", prescribed_value=(wind_speed, 0.0, 0.0), indices=inlet)
+bc_do_nothing = DoNothingBC(indices=outlet)
+# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method="ray", profile=bc_profile())
+bc_sphere = HybridBC(
+    bc_method="nonequilibrium_regularized",
+    mesh_vertices=sphere,
+    use_mesh_distance=True,
+    voxelization_method=MeshVoxelizationMethod("RAY"),
+    profile=bc_profile(),  # per-index wall velocity of the rotating sphere
+)
+# Not assigning a BC to the walls makes them periodic.
+boundary_conditions = [bc_left, bc_do_nothing, bc_sphere]  # order matters: the sphere is later picked up as boundary_conditions[-1]
+
+
+# Setup Stepper
+stepper = IncompressibleNavierStokesStepper(
+    grid=grid,
+    boundary_conditions=boundary_conditions,
+    collision_type="KBC",
+)
+
+# Make initializer operator
+from xlb.helper.initializers import CustomInitializer
+
+initializer = CustomInitializer(
+    bc_id=bc_do_nothing.id,
+    constant_velocity_vector=(wind_speed, 0.0, 0.0),
+    velocity_set=velocity_set,
+    precision_policy=precision_policy,
+    compute_backend=compute_backend,
+)
+
+# Prepare Fields
+f_0, f_1, bc_mask, missing_mask = stepper.prepare_fields(initializer=initializer)
+
+
+# -------------------------- Helper Functions --------------------------
+
+
+def plot_coefficient(time_steps, coefficients, prefix="drag"):
+    """
+    Plot a force coefficient with various moving averages and save it as "<prefix>_ma.png".
+
+    Args:
+        time_steps (list): List of time steps.
+        coefficients (list): List of force coefficients.
+        prefix (str): Coefficient name; used for the axis label, the title and the output file name.
+    """
+    time_steps_np = np.array(time_steps)  # arrays are needed for the slicing and convolution below
+    coefficients_np = np.array(coefficients)
+
+    # Define moving average windows
+    windows = [10, 100, 1000, 10000, 100000]
+    labels = ["MA 10", "MA 100", "MA 1,000", "MA 10,000", "MA 100,000"]
+
+    plt.figure(figsize=(12, 8))
+    plt.plot(time_steps_np, coefficients_np, label="Raw", alpha=0.5)
+
+    for window, label in zip(windows, labels):
+        if len(coefficients_np) >= window:  # skip windows longer than the data collected so far
+            ma = np.convolve(coefficients_np, np.ones(window) / window, mode="valid")
+            plt.plot(time_steps_np[window - 1 :], ma, label=label)  # "valid" output starts at sample window - 1
+
+    plt.ylim(-1.0, 1.0)
+    plt.legend()
+    plt.xlabel("Time step")
+    plt.ylabel(f"{prefix.capitalize()} coefficient")  # was hard-coded to "Drag", mislabelling the lift plot
+    plt.title(f"{prefix.capitalize()} Coefficient Over Time with Moving Averages")
+    plt.savefig(prefix + "_ma.png")
+    plt.close()
+
+
+def post_process(
+    step,
+    f_0,
+    f_1,
+    grid_shape,
+    macro,
+    momentum_transfer,
+    missing_mask,
+    bc_mask,
+    wind_speed,
+    car_cross_section,
+    drag_coefficients,
+    lift_coefficients,
+    time_steps,
+):
+    """
+    Post-process simulation data: save a velocity-magnitude image, compute forces, and plot drag and lift coefficients.
+
+    Args:
+        step (int): Current time step.
+        f_0, f_1: Distribution function buffers; f_0 feeds the macroscopic computation, both are passed to momentum_transfer.
+        grid_shape (tuple): Shape of the grid.
+        macro: Macroscopic operator object.
+        momentum_transfer: MomentumTransfer operator object.
+        missing_mask: Missing mask from stepper.
+        bc_mask: Boundary condition mask from stepper.
+        wind_speed (float): Prescribed wind speed.
+        car_cross_section (float): Reference cross-sectional area used to normalise the force coefficients.
+        drag_coefficients (list): List to store drag coefficients.
+        lift_coefficients (list): List to store lift coefficients.
+        time_steps (list): List to store time steps.
+    """
+    wp.synchronize()
+    # Convert to JAX array if necessary
+    if not isinstance(f_0, jnp.ndarray):
+        f_0_jax = wp.to_jax(f_0)
+    else:
+        f_0_jax = f_0
+
+    # Compute macroscopic quantities
+    rho, u = macro(f_0_jax)
+
+    # Remove boundary cells
+    u = u[:, 1:-1, 1:-1, 1:-1]
+    u_magnitude = jnp.sqrt(u[0] ** 2 + u[1] ** 2 + u[2] ** 2)
+
+    fields = {"ux": u[0], "uy": u[1], "uz": u[2], "u_magnitude": u_magnitude}
+
+    # Save fields in VTK format
+    # save_fields_vtk(fields, timestep=step)
+
+    # Save the u_magnitude slice at the mid y-plane
+    mid_y = grid_shape[1] // 2  # computed on the full grid, while u has already lost one cell per side
+    save_image(fields["u_magnitude"][:, mid_y, :], timestep=step)
+
+    # Compute lift and drag
+    boundary_force = momentum_transfer(f_0, f_1, bc_mask, missing_mask)
+    drag = boundary_force[0]  # x-direction
+    lift = boundary_force[2]  # third force component
+    cd = 2.0 * drag / (wind_speed**2 * car_cross_section)
+    cl = 2.0 * lift / (wind_speed**2 * car_cross_section)
+    print(f"CD={cd}, CL={cl}")
+    drag_coefficients.append(cd)  # the three lists are mutated in place and accumulate across calls
+    lift_coefficients.append(cl)
+    time_steps.append(step)
+
+    # Plot drag coefficient
+    plot_coefficient(time_steps, drag_coefficients, prefix="drag")
+    plot_coefficient(time_steps, lift_coefficients, prefix="lift")
+
+
+# Setup Momentum Transfer for Force Calculation
+bc_car = boundary_conditions[-1]  # last entry of boundary_conditions, i.e. bc_sphere
+momentum_transfer = MomentumTransfer(bc_car, compute_backend=compute_backend)
+
+# Define Macroscopic Calculation
+macro = Macroscopic(
+    compute_backend=ComputeBackend.JAX,  # JAX operator, which is why post_process converts f_0 with wp.to_jax
+    precision_policy=precision_policy,
+    velocity_set=xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=ComputeBackend.JAX),
+)
+
+# Initialize Lists to Store Coefficients and Time Steps
+time_steps = []
+drag_coefficients = []
+lift_coefficients = []
+
+# -------------------------- Simulation Loop --------------------------
+
+start_time = time.time()
+for step in range(num_steps):
+    # Perform simulation step
+    f_0, f_1 = stepper(f_0, f_1, bc_mask, missing_mask, omega, step)
+    f_0, f_1 = f_1, f_0  # Swap the buffers
+
+    # Print progress at intervals
+    if step % print_interval == 0:
+        elapsed_time = time.time() - start_time  # wall time since the previous progress line
+        print(f"Iteration: {step}/{num_steps} | Time elapsed: {elapsed_time:.2f}s")
+        start_time = time.time()
+
+    # Post-process at intervals and final step
+    if (step % post_process_interval == 0) or (step == num_steps - 1):
+        post_process(
+            step,
+            f_0,
+            f_1,
+            grid_shape,
+            macro,
+            momentum_transfer,
+            missing_mask,
+            bc_mask,
+            wind_speed,
+            sphere_cross_section,  # received by post_process as car_cross_section
+            drag_coefficients,
+            lift_coefficients,
+            time_steps,
+        )
+
+print("Simulation completed successfully.")
diff --git a/tests/kernels/collision/test_bgk_collision_jax.py b/tests/kernels/collision/test_bgk_collision_jax.py
index 91143932..3d6ea901 100644
--- a/tests/kernels/collision/test_bgk_collision_jax.py
+++ b/tests/kernels/collision/test_bgk_collision_jax.py
@@ -28,7 +28,7 @@ def init_xlb_env(velocity_set):
         (3, xlb.velocity_set.D3Q27, (50, 50, 50), 1.0),
     ],
 )
-def test_bgk_ollision(dim, velocity_set, grid_shape, omega):
+def test_bgk_collision(dim, velocity_set, grid_shape, omega):
     init_xlb_env(velocity_set)
     my_grid = grid_factory(grid_shape)
 
diff --git a/xlb/cell_type.py b/xlb/cell_type.py
new file mode 100644
index 00000000..ea082e10
--- /dev/null
+++ b/xlb/cell_type.py
@@ -0,0 +1,11 @@
+# Boundary-mask constants for the bc_mask field.
+# Each voxel in the domain carries a uint8 tag in bc_mask that encodes its role:
+#   BC_NONE  — regular fluid voxel (no boundary condition)
+#   BC_SFV   — Simple Fluid Voxel: fluid cell not involved in any BC,
+#              explosion, or coalescence (used for fast-path kernels)
+#   BC_SOLID — solid / obstacle voxel (skipped by all LBM operators)
+# Registered boundary conditions receive IDs in the range [1, 253], i.e. strictly between BC_NONE and BC_SFV.
+
+BC_NONE = 0
+BC_SFV = 254
+BC_SOLID = 255
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 36a9974d..53f8a335 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -96,20 +96,16 @@ def recursion_reference(level, app):
             if level < 0:
                 return
 
-            # Compute omega at the current level
             omega = self.omega_list[level]
 
-            print(f"RECURSION down to level {level}")
-            print(f"RECURSION Level {level}, COLLIDE")
-
             self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
                 level=level,
-                f_0=self.f_0,
-                f_1=self.f_1,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
                 omega=omega,
                 timestep=0,
             )
@@ -117,8 +113,7 @@ def recursion_reference(level, app):
             recursion_reference(level - 1, app)
             recursion_reference(level - 1, app)
 
-            # Important: swapping of f_0 and f_1 is done here
-            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            # Note: f_0 and f_1 are swapped in this streaming step
             self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
@@ -135,12 +130,9 @@ def recursion_fused_finest(level, app):
             if level < 0:
                 return
 
-            # Compute omega at the current level
             omega = self.omega_list[level]
 
             if level == 0:
-                print(f"RECURSION down to the finest level {level}")
-                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
                 self.add_to_app(
                     app=app,
                     op_name="finest_fused_pull",
@@ -167,9 +159,6 @@ def recursion_fused_finest(level, app):
                 )
                 return
 
-            print(f"RECURSION down to level {level}")
-            print(f"RECURSION Level {level}, COLLIDE")
-
             self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
@@ -181,18 +170,13 @@ def recursion_fused_finest(level, app):
                 omega=omega,
                 timestep=0,
             )
-            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
-            # so is_self_f1_the_coalescence_dst_field is True
-            # 2. Explision data is the output from the corser collide, which is f_1=self.f_1.
-            # so is_self_f1_the_explosion_src_field is True
 
             if level - 1 == 0:
                 recursion_fused_finest(level - 1, app)
             else:
                 recursion_fused_finest(level - 1, app)
                 recursion_fused_finest(level - 1, app)
-            # Important: swapping of f_0 and f_1 is done here
-            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            # Note: f_0 and f_1 are swapped in this streaming step
             self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
@@ -205,16 +189,13 @@ def recursion_fused_finest(level, app):
                 timestep=0,
             )
 
-        def recursion_fused_finest_254(level, app):
+        def recursion_fused_finest_SFV(level, app):
             if level < 0:
                 return
 
-            # Compute omega at the current level
             omega = self.omega_list[level]
 
             if level == 0:
-                print(f"RECURSION down to the finest level {level}")
-                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
                 self.add_to_app(
                     app=app,
                     op_name="CFV_finest_fused_pull",
@@ -261,9 +242,6 @@ def recursion_fused_finest_254(level, app):
                 )
                 return
 
-            print(f"RECURSION down to level {level}")
-            print(f"RECURSION Level {level}, COLLIDE")
-
             self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
@@ -275,18 +253,13 @@ def recursion_fused_finest_254(level, app):
                 omega=omega,
                 timestep=0,
             )
-            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
-            # so is_self_f1_the_coalescence_dst_field is True
-            # 2. Explision data is the output from the corser collide, which is f_1=self.f_1.
-            # so is_self_f1_the_explosion_src_field is True
 
             if level - 1 == 0:
-                recursion_fused_finest_254(level - 1, app)
+                recursion_fused_finest_SFV(level - 1, app)
             else:
-                recursion_fused_finest_254(level - 1, app)
-                recursion_fused_finest_254(level - 1, app)
-            # Important: swapping of f_0 and f_1 is done here
-            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+                recursion_fused_finest_SFV(level - 1, app)
+                recursion_fused_finest_SFV(level - 1, app)
+            # Note: f_0 and f_1 are swapped in this streaming step
             self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
@@ -299,16 +272,13 @@ def recursion_fused_finest_254(level, app):
                 timestep=0,
             )
 
-        def recursion_fused_finest_254_all(level, app):
+        def recursion_fused_finest_SFV_all(level, app):
             if level < 0:
                 return
 
-            # Compute omega at the current level
             omega = self.omega_list[level]
 
             if level == 0:
-                print(f"RECURSION down to the finest level {level}")
-                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
                 self.add_to_app(
                     app=app,
                     op_name="CFV_finest_fused_pull",
@@ -355,9 +325,6 @@ def recursion_fused_finest_254_all(level, app):
                 )
                 return
 
-            print(f"RECURSION down to level {level}")
-            print(f"RECURSION Level {level}, COLLIDE")
-
             self.add_to_app(
                 app=app,
                 op_name="CFV_collide_coarse",
@@ -380,18 +347,13 @@ def recursion_fused_finest_254_all(level, app):
                 omega=omega,
                 timestep=0,
             )
-            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
-            # so is_self_f1_the_coalescence_dst_field is True
-            # 2. Explision data is the output from the corser collide, which is f_1=self.f_1.
-            # so is_self_f1_the_explosion_src_field is True
 
             if level - 1 == 0:
-                recursion_fused_finest_254_all(level - 1, app)
+                recursion_fused_finest_SFV_all(level - 1, app)
             else:
-                recursion_fused_finest_254_all(level - 1, app)
-                recursion_fused_finest_254_all(level - 1, app)
-            # Important: swapping of f_0 and f_1 is done here
-            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+                recursion_fused_finest_SFV_all(level - 1, app)
+                recursion_fused_finest_SFV_all(level - 1, app)
+            # Note: f_0 and f_1 are swapped in this streaming step
             self.add_to_app(
                 app=app,
                 op_name="SFV_stream_coarse_step_ABC",
@@ -412,42 +374,25 @@ def recursion_fused_finest_254_all(level, app):
                 bc_mask_fd=self.bc_mask,
                 missing_mask_fd=self.missing_mask,
             )
-            return
 
         if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             recursion_reference(self.count_levels - 1, app=self.app)
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
             recursion_fused_finest(self.count_levels - 1, app=self.app)
-        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254:
-            # Run kernel that generates teh 254 value in the bc_mask
+        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_SFV:
             wp.synchronize()
-            # self.bc_mask.update_host(0)
-            # wp.synchronize()
-            # self.bc_mask.export_vti(f"mask_before.vti", "u")
-
             self.neon_container["SFV_reset_bc_mask"](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             wp.synchronize()
-            # self.bc_mask.update_host(0)
-            # wp.synchronize()
-            # self.bc_mask.export_vti(f"mask_after.vti", "u")
-            recursion_fused_finest_254(self.count_levels - 1, app=self.app)
-        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254_ALL:
-            # Run kernel that generates teh 254 value in the bc_mask
+            recursion_fused_finest_SFV(self.count_levels - 1, app=self.app)
+        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_SFV_ALL:
             wp.synchronize()
-            # self.bc_mask.update_host(0)
-            # wp.synchronize()
-            # self.bc_mask.export_vti(f"mask_before.vti", "u")
-
             num_levels = self.f_0.get_grid().num_levels
             for l in range(num_levels):
                 self.neon_container["SFV_reset_bc_mask"](l, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
-            # wp.synchronize()
-            # self.bc_mask.update_host(0)
             wp.synchronize()
-            # self.bc_mask.export_vti(f"mask_after.vti", "u")
-            recursion_fused_finest_254_all(self.count_levels - 1, app=self.app)
+            recursion_fused_finest_SFV_all(self.count_levels - 1, app=self.app)
         else:
-            raise ValueError(f"Unknown optimization level: {self.opt_level}")
+            raise ValueError(f"Unknown optimization level: {self.mres_perf_opt}")
 
         bk = self.grid.get_neon_backend()
         self.sk = neon.Skeleton(backend=bk)
diff --git a/xlb/mres_perf_optimization_type.py b/xlb/mres_perf_optimization_type.py
index ae14bcf1..0a1f968d 100644
--- a/xlb/mres_perf_optimization_type.py
+++ b/xlb/mres_perf_optimization_type.py
@@ -12,8 +12,8 @@ class MresPerfOptimizationType(Enum):
 
     NAIVE_COLLIDE_STREAM = auto()
     FUSION_AT_FINEST = auto()
-    FUSION_AT_FINEST_254 = auto()
-    FUSION_AT_FINEST_254_ALL = auto()
+    FUSION_AT_FINEST_SFV = auto()
+    FUSION_AT_FINEST_SFV_ALL = auto()
 
     @staticmethod
     def from_string(value: str) -> "MresPerfOptimizationType":
diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index 9f355515..97b7825c 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -84,5 +84,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 9c7476b7..434ad542 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -254,5 +254,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index e458efa7..fa78705e 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -179,5 +179,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index e6623c0f..2e8d5db6 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -65,7 +65,7 @@ def __init__(
             raise NotImplementedError("This BC is not implemented in 2D!")
 
         # Check if the compute backend is Warp
-        assert self.compute_backend == ComputeBackend.WARP or ComputeBackend.NEON, "This BC is currently not supported by JAX backend!"
+        assert self.compute_backend in (ComputeBackend.WARP, ComputeBackend.NEON), "This BC is currently not supported by JAX backend!"
 
         # Instantiate the operator for computing macroscopic values
         # Explicitly using the WARP backend for these operators as they may also be called by the Neon backend.
@@ -345,5 +345,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 5a940c62..bc4efb44 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -391,5 +391,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 9041fbae..69624d70 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -56,7 +56,7 @@ def __init__(
         self.implementation_step = implementation_step
 
         # A flag to indicate whether bc indices need to be padded in both normal directions to identify missing directions
-        # when inside/outside of the geoemtry is not known
+        # when inside/outside of the geometry is not known
         self.needs_padding = False
 
         # A flag for BCs that need normalized distance between the grid and a mesh (to be set to True if applicable inside each BC)
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 8e7ae7b9..e7732b07 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -476,7 +476,7 @@ def kernel(
 
     def _construct_neon(self):
         """
-        Constructs the Neon container for encoding auxilary data recovery.
+        Constructs the Neon container for encoding auxiliary data recovery.
         """
         # Use the warp functional for the Neon backend
         functional_dict, _ = self._construct_warp()
@@ -561,7 +561,7 @@ def __init__(
 
     def _construct_neon(self):
         """
-        Constructs the Neon container for encoding auxilary data recovery.
+        Constructs the Neon container for encoding auxiliary data recovery.
         """
 
         # Borrow the functional from the warp implementation
diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index 5009e25a..cd5bb8c0 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -1,11 +1,12 @@
 import warp as wp
+import neon
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
 from xlb.operator.operator import Operator
-import neon
+from xlb.cell_type import BC_SOLID
 
 
 class MeshMaskerAABB(MeshBoundaryMasker):
@@ -49,9 +50,11 @@ def functional(
             cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
             HALF_VOXEL = wp.vec3(0.5, 0.5, 0.5)
 
-            if self.read_field(bc_mask, index, 0) == wp.uint8(255) or self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - HALF_VOXEL):
+            if self.read_field(bc_mask, index, 0) == wp.uint8(BC_SOLID) or self.mesh_voxel_intersect(
+                mesh_id=mesh_id, low=cell_center_pos - HALF_VOXEL
+            ):
                 # Make solid voxel
-                self.write_field(bc_mask, index, 0, wp.uint8(255))
+                self.write_field(bc_mask, index, 0, wp.uint8(BC_SOLID))
             else:
                 # Find the boundary voxels and their missing directions
                 for direction_idx in range(_q):
diff --git a/xlb/operator/boundary_masker/aabb_close.py b/xlb/operator/boundary_masker/aabb_close.py
index ea3ac636..68398b61 100644
--- a/xlb/operator/boundary_masker/aabb_close.py
+++ b/xlb/operator/boundary_masker/aabb_close.py
@@ -9,6 +9,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
+from xlb.cell_type import BC_SOLID
 
 
 class MeshMaskerAABBClose(MeshBoundaryMasker):
@@ -68,7 +69,7 @@ def dilate_tile(mask_field: wp.array3d(dtype=Any), mask_field_out: wp.array3d(dt
         # Erode the solid mask in mask_field, removing a layer of outer solid voxels, storing output in mask_field_out
         @wp.func
         def functional_erode(index: Any, mask_field: Any, mask_field_out: Any):
-            min_val = wp.uint8(255)
+            min_val = wp.uint8(BC_SOLID)
             for l in range(_q):
                 if l == lattice_central_index:
                     continue
@@ -104,7 +105,7 @@ def functional_solid(index: Any, mesh_id: Any, solid_mask: Any, offset: Any):
 
             if self.mesh_voxel_intersect(mesh_id=mesh_id, low=cell_center_pos - half):
                 # Make solid voxel
-                self.write_field(solid_mask, index, 0, wp.uint8(255))
+                self.write_field(solid_mask, index, 0, wp.uint8(BC_SOLID))
 
         @wp.kernel
         def kernel_solid(
@@ -137,9 +138,9 @@ def functional_aabb(
             cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
             HALF_VOXEL = wp.vec3(0.5, 0.5, 0.5)
 
-            if self.read_field(solid_mask, index, 0) == wp.uint8(255) or self.read_field(bc_mask, index, 0) == wp.uint8(255):
+            if self.read_field(solid_mask, index, 0) == wp.uint8(BC_SOLID) or self.read_field(bc_mask, index, 0) == wp.uint8(BC_SOLID):
                 # Make solid voxel
-                self.write_field(bc_mask, index, 0, wp.uint8(255))
+                self.write_field(bc_mask, index, 0, wp.uint8(BC_SOLID))
             else:
                 # Find the boundary voxels and their missing directions
                 for direction_idx in range(_q):
@@ -152,7 +153,7 @@ def functional_aabb(
 
                     # Check to see if this neighbor is solid
                     if self.helper_masker.is_in_bounds(index, wp.vec3i(solid_mask.shape[0], solid_mask.shape[1], solid_mask.shape[2]), 1):
-                        if self.read_field(solid_mask, index + direction_idx, 0) == wp.uint8(255):
+                        if self.read_field(solid_mask, index + direction_idx, 0) == wp.uint8(BC_SOLID):
                             # We know we have a solid neighbor
                             # Set the boundary id and missing_mask
                             self.write_field(bc_mask, index, 0, wp.uint8(id_number))
@@ -198,9 +199,9 @@ def kernel(
             # position of the point
             cell_center_pos = self.helper_masker.index_to_position(bc_mask, index)
 
-            if solid_mask[i, j, k] == wp.uint8(255) or bc_mask[0, index[0], index[1], index[2]] == wp.uint8(255):
+            if solid_mask[i, j, k] == wp.uint8(BC_SOLID) or bc_mask[0, index[0], index[1], index[2]] == wp.uint8(BC_SOLID):
                 # Make solid voxel
-                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+                bc_mask[0, index[0], index[1], index[2]] = wp.uint8(BC_SOLID)
             else:
                 # Find the boundary voxels and their missing directions
                 for direction_idx in range(_q):
@@ -210,8 +211,8 @@ def kernel(
                     direction_vec = wp.vec3f(wp.float32(_c[0, direction_idx]), wp.float32(_c[1, direction_idx]), wp.float32(_c[2, direction_idx]))
 
                     # Check to see if this neighbor is solid - this is super inefficient TODO: make it way better
-                    # if solid_mask[i,j,k] == wp.uint8(255):
-                    if solid_mask[i + _c[0, direction_idx], j + _c[1, direction_idx], k + _c[2, direction_idx]] == wp.uint8(255):
+                    # if solid_mask[i,j,k] == wp.uint8(BC_SOLID):
+                    if solid_mask[i + _c[0, direction_idx], j + _c[1, direction_idx], k + _c[2, direction_idx]] == wp.uint8(BC_SOLID):
                         # We know we have a solid neighbor
                         # Set the boundary id and missing_mask
                         bc_mask[0, index[0], index[1], index[2]] = wp.uint8(id_number)
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 343cc22d..b51a1c92 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -5,6 +5,7 @@
 import jax.numpy as jnp
 import numpy as np
 import warp as wp
+import neon
 
 from xlb.compute_backend import ComputeBackend
 from xlb.grid import grid_factory
@@ -12,7 +13,7 @@
 from xlb.operator.stream.stream import Stream
 from xlb.precision_policy import Precision
 from xlb.operator.boundary_masker.helper_functions_masker import HelperFunctionsMasker
-import neon
+from xlb.cell_type import BC_SOLID
 
 
 class IndicesBoundaryMasker(Operator):
@@ -153,10 +154,10 @@ def functional_domain_bounds(
                     continue
 
                 if is_interior[ii] == wp.uint8(True):
-                    # If the index is in the interior, we set that index to be a solid node (identified by 255)
+                    # If the index is in the interior, we set that index to be a solid node (identified by BC_SOLID)
                     # This information will be used in the next kernel to identify missing directions using the
                     # padded indices of the solid node that are associated with the boundary condition.
-                    self.write_field(bc_mask, index, 0, wp.uint8(255))
+                    self.write_field(bc_mask, index, 0, wp.uint8(BC_SOLID))
                     return
 
                 # Set bc_mask for all bc indices
@@ -206,7 +207,7 @@ def functional_interior_missing_mask(
 
                     # Check if pull index is a fluid node (bc_mask is zero for fluid nodes)
                     bc_mask_ngh = self.read_field_neighbor(bc_mask, index, offset, 0)
-                    if (self.helper_masker.is_in_bounds(pull_index, grid_shape)) and (bc_mask_ngh == wp.uint8(255)):
+                    if (self.helper_masker.is_in_bounds(pull_index, grid_shape)) and (bc_mask_ngh == wp.uint8(BC_SOLID)):
                         self.write_field(missing_mask, index, l, wp.uint8(True))
 
         # Construct the warp 3D kernel
diff --git a/xlb/operator/boundary_masker/multires_aabb_close.py b/xlb/operator/boundary_masker/multires_aabb_close.py
index 1693919b..2f4976df 100644
--- a/xlb/operator/boundary_masker/multires_aabb_close.py
+++ b/xlb/operator/boundary_masker/multires_aabb_close.py
@@ -1,11 +1,12 @@
 import warp as wp
+import neon
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker import MeshMaskerAABBClose
 from xlb.operator.operator import Operator
-import neon
+from xlb.cell_type import BC_SOLID
 
 
 class MultiresMeshMaskerAABBClose(MeshMaskerAABBClose):
@@ -66,8 +67,8 @@ def mres_functional_aabb(
             # If already solid or bc, mark solid
             solid_val = wp.neon_read(solid_mask_pn, index, 0)
             bc_val = wp.neon_read(bc_mask_pn, index, 0)
-            if solid_val == wp.uint8(255) or bc_val == wp.uint8(255):
-                wp.neon_write(bc_mask_pn, index, 0, wp.uint8(255))
+            if solid_val == wp.uint8(BC_SOLID) or bc_val == wp.uint8(BC_SOLID):
+                wp.neon_write(bc_mask_pn, index, 0, wp.uint8(BC_SOLID))
                 return
 
             # loop lattice directions
@@ -81,7 +82,7 @@ def mres_functional_aabb(
                 is_valid = wp.bool(False)
                 nval = wp.neon_read_ngh(solid_mask_pn, index, ngh, 0, wp.uint8(0), is_valid)
                 if is_valid:
-                    if nval == wp.uint8(255):
+                    if nval == wp.uint8(BC_SOLID):
                         # Found solid neighbor -> boundary cell
                         self.write_field(bc_mask_pn, index, 0, wp.uint8(id_number))
                         self.write_field(missing_mask_pn, index, _opp_indices[direction_idx], wp.uint8(True))
diff --git a/xlb/operator/boundary_masker/winding.py b/xlb/operator/boundary_masker/winding.py
index b44edc2d..52b157a9 100644
--- a/xlb/operator/boundary_masker/winding.py
+++ b/xlb/operator/boundary_masker/winding.py
@@ -5,6 +5,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
 from xlb.operator.operator import Operator
+from xlb.cell_type import BC_SOLID
 
 
 class MeshMaskerWinding(MeshBoundaryMasker):
@@ -59,7 +60,7 @@ def kernel(
                 # set point to be solid
                 if query.sign <= 0:  # TODO: fix this
                     # Make solid voxel
-                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(255)
+                    bc_mask[0, index[0], index[1], index[2]] = wp.uint8(BC_SOLID)
 
                     # Find the fractional distance to the mesh in each direction
                     for direction_idx in range(1, _q):
diff --git a/xlb/operator/equilibrium/__init__.py b/xlb/operator/equilibrium/__init__.py
index 372ae1f7..beb7bb5e 100644
--- a/xlb/operator/equilibrium/__init__.py
+++ b/xlb/operator/equilibrium/__init__.py
@@ -1,3 +1,3 @@
 from xlb.operator.equilibrium.equilibrium import Equilibrium
 from xlb.operator.equilibrium.quadratic_equilibrium import QuadraticEquilibrium
-from xlb.operator.equilibrium.mulltires_quadratic_equilibrium import MultiresQuadraticEquilibrium
+from xlb.operator.equilibrium.multires_quadratic_equilibrium import MultiresQuadraticEquilibrium
diff --git a/xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py b/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
similarity index 100%
rename from xlb/operator/equilibrium/mulltires_quadratic_equilibrium.py
rename to xlb/operator/equilibrium/multires_quadratic_equilibrium.py
diff --git a/xlb/operator/macroscopic/first_moment.py b/xlb/operator/macroscopic/first_moment.py
index 2842ec1e..626767fd 100644
--- a/xlb/operator/macroscopic/first_moment.py
+++ b/xlb/operator/macroscopic/first_moment.py
@@ -86,5 +86,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index 0de4f54a..4bc79b90 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -7,6 +7,7 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
 from xlb.operator.macroscopic import Macroscopic, ZeroMoment, FirstMoment
+from xlb.cell_type import BC_SOLID
 
 
 class MultiresMacroscopic(Macroscopic):
@@ -55,7 +56,7 @@ def macroscopic_cl(gIdx: typing.Any):
 
                     _rho, _u = functional(_f)
 
-                    if _boundary_id == wp.uint8(255) or wp.neon_has_child(f, gIdx):
+                    if _boundary_id == wp.uint8(BC_SOLID) or wp.neon_has_child(f, gIdx):
                         _rho = self.compute_dtype(0.0)
                         for d in range(_d):
                             _u[d] = self.compute_dtype(0.0)
diff --git a/xlb/operator/macroscopic/second_moment.py b/xlb/operator/macroscopic/second_moment.py
index ee74bdd9..1a0a0f07 100644
--- a/xlb/operator/macroscopic/second_moment.py
+++ b/xlb/operator/macroscopic/second_moment.py
@@ -111,5 +111,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/macroscopic/zero_moment.py b/xlb/operator/macroscopic/zero_moment.py
index 0b7cd7b7..f536f8d7 100644
--- a/xlb/operator/macroscopic/zero_moment.py
+++ b/xlb/operator/macroscopic/zero_moment.py
@@ -66,5 +66,5 @@ def _construct_neon(self):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f, rho):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index f2e9490e..f42c60a8 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -25,6 +25,7 @@
     MultiresMeshMaskerRay,
 )
 from xlb.operator.boundary_condition.helper_functions_bc import MultiresEncodeAuxiliaryData
+from xlb.cell_type import BC_SFV, BC_SOLID
 
 """
 SFV = Simple Fluid Voxel: a fluid voxel that is not a BC nor is involved in explosion or coalescence
@@ -122,7 +123,7 @@ def ll_coalescence_count(loader: neon.Loader):
                 @wp.func
                 def cl_collide_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
                     if not wp.neon_has_child(coalescence_factor_pn, index):
                         for l in range(self.velocity_set.q):
@@ -153,17 +154,15 @@ def loading(loader: neon.Loader):
                 @wp.func
                 def compute(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(coalescence_factor_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(coalescence_factor_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
-                            # HERE, we skip the center direction
                             continue
 
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
@@ -173,17 +172,11 @@ def compute(index: Any):
                             coalescence_factor_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level
                         )
 
-                        # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(coalescence_factor_pn, index, pull_direction):
                             pass
                         else:
-                            # HERE -> I have a finer ngh. in direction pull (opposite l)
-                            # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # Full State
-                                # YES finer ngh. in the pull direction (opposite of l)
-                                # YES ngh. at the same level
-                                # -> **Coalescence**
+                                # Coalescence
                                 if coalescence_factor > self.compute_dtype(0):
                                     coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor)
                                     wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
@@ -383,7 +376,6 @@ def ll_collide_coarse(loader: neon.Loader):
                     f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 # fake loading to enforce sequential step
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
@@ -393,12 +385,7 @@ def ll_collide_coarse(loader: neon.Loader):
                 @wp.func
                 def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    """
-                    The c++ version starts with the following, which I am not sure is right:
-                        if (type(cell, 0) == CellType::bulk ) {
-                    BC type cells should do collide too
-                    """
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
                     if not wp.neon_has_child(f_0_pn, index):
@@ -424,13 +411,11 @@ def device(index: Any):
                             if level < num_levels - 1:
                                 val = _f_post_collision[l]
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
-                                # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
 
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
-                            # Verified that this is not needed: wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
 
                 loader.declare_kernel(device)
 
@@ -465,7 +450,7 @@ def ll_collide_coarse(loader: neon.Loader):
                 @wp.func
                 def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id != wp.uint8(254):
+                    if _boundary_id != wp.uint8(BC_SFV):
                         return
 
                     # Read thread data for populations, these are post streaming
@@ -506,7 +491,6 @@ def ll_collide_coarse(loader: neon.Loader):
                     f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 # fake loading to enforce sequential step
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
@@ -516,14 +500,9 @@ def ll_collide_coarse(loader: neon.Loader):
                 @wp.func
                 def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    """
-                    The c++ version starts with the following, which I am not sure is right:
-                        if (type(cell, 0) == CellType::bulk ) {
-                    BC type cells should do collide too
-                    """
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
-                    if _boundary_id == wp.uint8(254):
+                    if _boundary_id == wp.uint8(BC_SFV):
                         return
                     if not wp.neon_has_child(f_0_pn, index):
                         # Read thread data for populations, these are post streaming
@@ -548,13 +527,11 @@ def device(index: Any):
                             if level < num_levels - 1:
                                 val = _f_post_collision[l]
                                 wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
-                                # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
 
                             wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
-                            # Verified that this is not needed: wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
 
                 loader.declare_kernel(device)
 
@@ -572,13 +549,6 @@ def stream_coarse_step_ABC(
         ):
             num_levels = f_0_fd.get_grid().num_levels
 
-            # if level != 0:
-            #     # throw an exception
-            #     raise Exception("Only the finest level is supported for now")
-
-            # module op to define odd of even iteration
-            # od_or_even = wp.module("odd_or_even", "even")
-
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -596,12 +566,11 @@ def ll_stream_coarse(loader: neon.Loader):
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     # do stream normally
@@ -612,7 +581,6 @@ def cl_stream_coarse(index: Any):
 
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
-                            # HERE, we skip the center direction
                             continue
 
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
@@ -620,48 +588,22 @@ def cl_stream_coarse(index: Any):
                         has_ngh_at_same_level = wp.bool(False)
                         accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
-                        # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
-                            # NO finer ngh. in the pull direction (opposite of l)
                             if not has_ngh_at_same_level:
-                                # NO ngh. at the same level
-                                # COULD we have a ngh. at the courser level?
                                 if wp.neon_has_parent(f_0_pn, index):
-                                    # YES halo cell on top of us
                                     has_a_coarser_ngh = wp.bool(False)
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
                                         f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
                                     if has_a_coarser_ngh:
-                                        # Full state:
-                                        # NO finer ngh. in the pull direction (opposite of l)
-                                        # NO ngh. at the same level
-                                        # YES ghost cell on top of us
-                                        # YES courser ngh.
-                                        # -> **Explosion**
-                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        # Explosion
                                         _f_post_stream[l] = exploded_pop
                         else:
-                            # HERE -> I have a finer ngh. in direction pull (opposite l)
-                            # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # if l == 10:
-                                #     wp.print(accumulated)
-                                #     glob = wp.neon_global_idx(f_1_pn, index)
-                                #     wp.neon_cuda_info()
-                                #     wp.neon_print(glob)
-                                #     wp.neon_level(f_1_pn)
-                                # accumulated = _w[l]
-                                # Full State
-                                # YES finer ngh. in the pull direction (opposite of l)
-                                # YES ngh. at the same level
-                                # -> **Coalescence**
+                                # Coalescence
                                 coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                                 accumulated = accumulated * coalescence_factor
-                                # wp.neon_write(f_1_pn, index, l, accumulated)
                                 _f_post_stream[l] = accumulated
-                            # else:
-                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                     # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
@@ -704,14 +646,13 @@ def ll_stream_coarse(loader: neon.Loader):
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
 
-                    if _boundary_id == wp.uint8(254):
+                    if _boundary_id == wp.uint8(BC_SFV):
                         return
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     # do stream normally
@@ -722,7 +663,6 @@ def cl_stream_coarse(index: Any):
 
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
-                            # HERE, we skip the center direction
                             continue
 
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
@@ -730,48 +670,22 @@ def cl_stream_coarse(index: Any):
                         has_ngh_at_same_level = wp.bool(False)
                         accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
-                        # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
-                            # NO finer ngh. in the pull direction (opposite of l)
                             if not has_ngh_at_same_level:
-                                # NO ngh. at the same level
-                                # COULD we have a ngh. at the courser level?
                                 if wp.neon_has_parent(f_0_pn, index):
-                                    # YES halo cell on top of us
                                     has_a_coarser_ngh = wp.bool(False)
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
                                         f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
                                     if has_a_coarser_ngh:
-                                        # Full state:
-                                        # NO finer ngh. in the pull direction (opposite of l)
-                                        # NO ngh. at the same level
-                                        # YES ghost cell on top of us
-                                        # YES courser ngh.
-                                        # -> **Explosion**
-                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        # Explosion
                                         _f_post_stream[l] = exploded_pop
                         else:
-                            # HERE -> I have a finer ngh. in direction pull (opposite l)
-                            # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # if l == 10:
-                                #     wp.print(accumulated)
-                                #     glob = wp.neon_global_idx(f_1_pn, index)
-                                #     wp.neon_cuda_info()
-                                #     wp.neon_print(glob)
-                                #     wp.neon_level(f_1_pn)
-                                # accumulated = _w[l]
-                                # Full State
-                                # YES finer ngh. in the pull direction (opposite of l)
-                                # YES ngh. at the same level
-                                # -> **Coalescence**
+                                # Coalescence
                                 coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                                 accumulated = accumulated * coalescence_factor
-                                # wp.neon_write(f_1_pn, index, l, accumulated)
                                 _f_post_stream[l] = accumulated
-                            # else:
-                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                     # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
@@ -795,7 +709,7 @@ def SFV_reset_bc_mask(
             missing_mask_fd: Any,
         ):
             """
-            Setting the BC type to 254 for SFVs
+            Mark simple fluid voxels (SFVs) by setting their BC type to BC_SFV
             """
 
             def ll_stream_coarse(loader: neon.Loader):
@@ -811,14 +725,13 @@ def ll_stream_coarse(loader: neon.Loader):
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
                     if _boundary_id != 0:
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     # do stream normally
@@ -829,7 +742,6 @@ def cl_stream_coarse(index: Any):
 
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
-                            # HERE, we skip the center direction
                             continue
 
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
@@ -837,39 +749,21 @@ def cl_stream_coarse(index: Any):
                         has_ngh_at_same_level = wp.bool(False)
                         wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
-                        # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
-                            # NO finer ngh. in the pull direction (opposite of l)
                             if not has_ngh_at_same_level:
-                                # NO ngh. at the same level
-                                # COULD we have a ngh. at the courser level?
                                 if wp.neon_has_parent(f_0_pn, index):
-                                    # YES halo cell on top of us
                                     has_a_coarser_ngh = wp.bool(False)
                                     wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh)
                                     if has_a_coarser_ngh:
-                                        # Full state:
-                                        # NO finer ngh. in the pull direction (opposite of l)
-                                        # NO ngh. at the same level
-                                        # YES ghost cell on top of us
-                                        # YES courser ngh.
-                                        # -> **Explosion**
+                                        # Explosion: not an SFV
                                         return
                         else:
-                            # HERE -> I have a finer ngh. in direction pull (opposite l)
-                            # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # Full State
-                                # YES finer ngh. in the pull direction (opposite of l)
-                                # YES ngh. at the same level
-                                # -> **Coalescence**
+                                # Coalescence: not an SFV
                                 return
 
-                    # Only fluid voxels with the following properties can reach this line:
-                    # They are not BC voxels
-                    # They are not on a resolution jump -> they do not do coalescence or explosion
-                    # They are not mr halo cells
-                    wp.neon_write(bc_mask_pn, index, 0, wp.uint8(254))
+                    # Voxel is a pure fluid cell with no multi-resolution interactions — mark as SFV
+                    wp.neon_write(bc_mask_pn, index, 0, wp.uint8(BC_SFV))
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -899,12 +793,11 @@ def ll_stream_coarse(loader: neon.Loader):
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     # do stream normally
@@ -915,7 +808,6 @@ def cl_stream_coarse(index: Any):
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    # wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -937,12 +829,12 @@ def ll_stream_coarse(loader: neon.Loader):
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id != wp.uint8(254):
+                    if _boundary_id != wp.uint8(BC_SFV):
                         return
-                    # 254 voxel type:
-                    # They are not BC voxels
-                    # They are not on a resolution jump -> they do not do coalescence or explosion
-                    # They are not mr halo cells
+                    # BC_SFV voxel type:
+                    #   - They are not BC voxels
+                    #   - They are not on a resolution jump -> they do not do coalescence or explosion
+                    #   - They are not mr halo cells
 
                     _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
@@ -981,17 +873,15 @@ def ll_stream_coarse(loader: neon.Loader):
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
-                            # HERE, we skip the center direction
                             continue
 
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
@@ -999,34 +889,19 @@ def cl_stream_coarse(index: Any):
                         has_ngh_at_same_level = wp.bool(False)
                         accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
-                        # if (!pin.hasChildren(cell, dir)) {
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
-                            # NO finer ngh. in the pull direction (opposite of l)
                             if not has_ngh_at_same_level:
-                                # NO ngh. at the same level
-                                # COULD we have a ngh. at the courser level?
                                 if wp.neon_has_parent(f_0_pn, index):
-                                    # YES halo cell on top of us
                                     has_a_coarser_ngh = wp.bool(False)
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
                                         f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
                                     if has_a_coarser_ngh:
-                                        # Full state:
-                                        # NO finer ngh. in the pull direction (opposite of l)
-                                        # NO ngh. at the same level
-                                        # YES ghost cell on top of us
-                                        # YES courser ngh.
-                                        # -> **Explosion**
+                                        # Explosion
                                         wp.neon_write(f_1_pn, index, l, exploded_pop)
                         else:
-                            # HERE -> I have a finer ngh. in direction pull (opposite l)
-                            # Then I have to read from the halo on top of my finer ngh.
                             if has_ngh_at_same_level:
-                                # Full State
-                                # YES finer ngh. in the pull direction (opposite of l)
-                                # YES ngh. at the same level
-                                # -> **Coalescence**
+                                # Coalescence
                                 coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                                 accumulated = accumulated * coalescence_factor
                                 wp.neon_write(f_1_pn, index, l, accumulated)
@@ -1047,7 +922,6 @@ def finest_fused_pull(
             is_f1_the_explosion_src_field: bool,
         ):
             if level != 0:
-                # throw an exception
                 raise Exception("Only the finest level is supported for now")
             grid = f_0_fd.get_grid()
             num_levels = grid.num_levels
@@ -1071,12 +945,11 @@ def finest_fused_pull_launcher(loader: neon.Loader):
                 @wp.func
                 def finest_fused_pull_kernel(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     # do stream normally
@@ -1087,7 +960,6 @@ def finest_fused_pull_kernel(index: Any):
 
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
-                            # HERE, we skip the center direction
                             continue
 
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
@@ -1095,12 +967,8 @@ def finest_fused_pull_kernel(index: Any):
                         has_ngh_at_same_level = wp.bool(False)
                         accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
-                        # NO finer ngh. in the pull direction (opposite of l)
                         if not has_ngh_at_same_level:
-                            # NO ngh. at the same level
-                            # COULD we have a ngh. at the courser level?
                             if wp.neon_has_parent(f_0_pn, index):
-                                # YES halo cell on top of us
                                 has_a_coarser_ngh = wp.bool(False)
                                 if is_f1_the_explosion_src_field:
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
@@ -1111,13 +979,7 @@ def finest_fused_pull_kernel(index: Any):
                                         f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
                                 if has_a_coarser_ngh:
-                                    # Full state:
-                                    # NO finer ngh. in the pull direction (opposite of l)
-                                    # NO ngh. at the same level
-                                    # YES ghost cell on top of us
-                                    # YES courser ngh.
-                                    # -> **Explosion**
-                                    # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                    # Explosion
                                     _f_post_stream[l] = exploded_pop
 
                     # do non mres post-streaming corrections
@@ -1163,7 +1025,6 @@ def CFV_finest_fused_pull(
             is_f1_the_explosion_src_field: bool,
         ):
             if level != 0:
-                # throw an exception
                 raise Exception("Only the finest level is supported for now")
             grid = f_0_fd.get_grid()
             num_levels = grid.num_levels
@@ -1187,14 +1048,13 @@ def finest_fused_pull_launcher(loader: neon.Loader):
                 @wp.func
                 def finest_fused_pull_kernel(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
-                    if _boundary_id == wp.uint8(254):
+                    if _boundary_id == wp.uint8(BC_SFV):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     # do stream normally
@@ -1205,7 +1065,6 @@ def finest_fused_pull_kernel(index: Any):
 
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
-                            # HERE, we skip the center direction
                             continue
 
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
@@ -1213,12 +1072,8 @@ def finest_fused_pull_kernel(index: Any):
                         has_ngh_at_same_level = wp.bool(False)
                         accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
-                        # NO finer ngh. in the pull direction (opposite of l)
                         if not has_ngh_at_same_level:
-                            # NO ngh. at the same level
-                            # COULD we have a ngh. at the courser level?
                             if wp.neon_has_parent(f_0_pn, index):
-                                # YES halo cell on top of us
                                 has_a_coarser_ngh = wp.bool(False)
                                 if is_f1_the_explosion_src_field:
                                     exploded_pop = wp.neon_lbm_read_coarser_ngh(
@@ -1229,12 +1084,7 @@ def finest_fused_pull_kernel(index: Any):
                                         f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                                     )
                                 if has_a_coarser_ngh:
-                                    # Full state:
-                                    # NO finer ngh. in the pull direction (opposite of l)
-                                    # NO ngh. at the same level
-                                    # YES ghost cell on top of us
-                                    # YES courser ngh.
-                                    # -> **Explosion**
+                                    # Explosion
                                     _f_post_stream[l] = exploded_pop
 
                     # do non mres post-streaming corrections
@@ -1268,7 +1118,7 @@ def finest_fused_pull_kernel(index: Any):
 
             return finest_fused_pull_launcher
 
-        @neon.Container.factory(name="254_finest_fused_pull")
+        @neon.Container.factory(name="SFV_finest_fused_pull")
         def SFV_finest_fused_pull(
             level: int,
             f_0_fd: Any,
@@ -1278,7 +1128,6 @@ def SFV_finest_fused_pull(
             omega: Any,
         ):
             if level != 0:
-                # throw an exception
                 raise Exception("Only the finest level is supported for now")
 
             def finest_fused_pull_launcher(loader: neon.Loader):
@@ -1294,9 +1143,9 @@ def finest_fused_pull_launcher(loader: neon.Loader):
                 _w = self.velocity_set.w
 
                 @wp.func
-                def finest_fused_pull_kernel_254(index: Any):
+                def finest_fused_pull_kernel_SFV(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id != wp.uint8(254):
+                    if _boundary_id != wp.uint8(BC_SFV):
                         return
 
                     # do stream normally
@@ -1312,7 +1161,7 @@ def finest_fused_pull_kernel_254(index: Any):
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
 
-                loader.declare_kernel(finest_fused_pull_kernel_254)
+                loader.declare_kernel(finest_fused_pull_kernel_SFV)
 
             return finest_fused_pull_launcher
 
@@ -1340,16 +1189,14 @@ def ll_stream_coarse(loader: neon.Loader):
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
 
-                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
-                    if are_we_a_halo_cell:
-                        # HERE: we are a halo cell so we just exit
+                    if wp.neon_has_child(f_0_pn, index):
+                        # we are a halo cell so we just exit
                         return
 
                     _missing_mask = _missing_mask_vec()
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     # do stream normally
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f1_thread, _missing_mask = neon_get_thread_data(f_1_pn, missing_mask_pn, index)
@@ -1367,7 +1214,6 @@ def cl_stream_coarse(index: Any):
             return ll_stream_coarse
 
         return None, {
-            # "single_step_finest": single_step_finest,
             "collide_coarse": collide_coarse,
             "stream_coarse_step_ABC": stream_coarse_step_ABC,
             "stream_coarse_step_A": stream_coarse_step_A,
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index f2685ba4..05e59a17 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -30,6 +30,7 @@
 from xlb.helper import check_bc_overlaps
 from xlb.helper.nse_fields import create_nse_fields
 from xlb.operator.boundary_condition.helper_functions_bc import EncodeAuxiliaryData
+from xlb.cell_type import BC_SOLID
 
 
 class IncompressibleNavierStokesStepper(Stepper):
@@ -275,7 +276,7 @@ def jax_implementation_push(self, f_0, f_1, bc_mask, missing_mask, omega, timest
 
         # Apply collision type boundary conditions
         for bc in self.boundary_conditions:
-            f_post_collision = bc.update_bc_auxilary_data(f_post_stream, f_post_collision, bc_mask, missing_mask)
+            f_post_collision = bc.update_bc_auxiliary_data(f_post_stream, f_post_collision, bc_mask, missing_mask)
             if bc.implementation_step == ImplementationStep.COLLISION:
                 f_post_collision = bc(
                     f_post_stream,
@@ -408,7 +409,7 @@ def kernel(
             index = wp.vec3i(i, j, k)
 
             _boundary_id = bc_mask[0, index[0], index[1], index[2]]
-            if _boundary_id == wp.uint8(255):
+            if _boundary_id == wp.uint8(BC_SOLID):
                 return
 
             # Apply streaming
@@ -565,7 +566,7 @@ def nse_stepper_ll(loader: neon.Loader):
                 @wp.func
                 def nse_stepper_cl(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(255):
+                    if _boundary_id == wp.uint8(BC_SOLID):
                         return
                     # Apply streaming
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index 3773ea4e..cda8547b 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -139,5 +139,5 @@ def functional(
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, f_0, f_1):
-        # rise exception as this feature is not implemented yet
+        # raise exception as this feature is not implemented yet
         raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 20fcb092..c84a4063 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -63,7 +63,7 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
     """
     # Load the mesh and get its bounding box
     mesh = trimesh.load_mesh(stl_filename, process=False)
-    assert not mesh.is_empty, ValueError("Loaded mesh is empty or invalid.")
+    assert not mesh.is_empty, "Loaded mesh is empty or invalid."
 
     mesh_vertices = mesh.vertices
     min_bound = mesh_vertices.min(axis=0)
@@ -481,7 +481,7 @@ def get_fields_data(self, field_neon_dict):
 
         # Ensure that this operator is called on multires grids
         grid_mres = next(iter(field_neon_dict.values())).get_grid()
-        assert grid_mres.name == "mGrid", f"Operation {self.__class__.__name} is only applicable to multi-resolution cases!"
+        assert grid_mres.name == "mGrid", f"Operation {self.__class__.__name__} is only applicable to multi-resolution cases!"
 
         for field_name in field_neon_dict.keys():
             assert field_name in self.field_name_cardinality_dict.keys(), (
diff --git a/xlb/utils/utils.py b/xlb/utils/utils.py
index 363b4991..bb1f7782 100644
--- a/xlb/utils/utils.py
+++ b/xlb/utils/utils.py
@@ -83,7 +83,7 @@ def save_image(fld, timestep=None, prefix=None, **kwargs):
     if len(fld.shape) > 3:
         raise ValueError("The input field should be 2D!")
     if len(fld.shape) == 3:
-        fld = np.sqrt(fld[0, ...] ** 2 + fld[0, ...] ** 2)
+        fld = np.sqrt(fld[0, ...] ** 2 + fld[1, ...] ** 2 + fld[2, ...] ** 2)
 
     plt.clf()
     kwargs.pop("cmap", None)
@@ -237,7 +237,7 @@ def rotate_geometry(indices, origin, axis, angle):
     return tuple(jnp.rint(indices_rotated).astype("int32").T)
 
 
-def voxelize_stl(stl_filename, length_lbm_unit=None, tranformation_matrix=None, pitch=None):
+def voxelize_stl(stl_filename, length_lbm_unit=None, transformation_matrix=None, pitch=None):
     """
     Converts an STL file to a voxelized mesh.
 
@@ -247,7 +247,7 @@ def voxelize_stl(stl_filename, length_lbm_unit=None, tranformation_matrix=None,
         The name of the STL file to be voxelized.
     length_lbm_unit : float, optional
         The unit length in LBM. Either this or 'pitch' must be provided.
-    tranformation_matrix : array-like, optional
+    transformation_matrix : array-like, optional
         A transformation matrix to be applied to the mesh before voxelization.
     pitch : float, optional
         The pitch of the voxel grid. Either this or 'length_lbm_unit' must be provided.
@@ -267,8 +267,8 @@ def voxelize_stl(stl_filename, length_lbm_unit=None, tranformation_matrix=None,
         raise ValueError("Either 'length_lbm_unit' or 'pitch' must be provided!")
     mesh = trimesh.load_mesh(stl_filename, process=False)
     length_phys_unit = mesh.extents.max()
-    if tranformation_matrix is not None:
-        mesh.apply_transform(tranformation_matrix)
+    if transformation_matrix is not None:
+        mesh.apply_transform(transformation_matrix)
     if pitch is None:
         pitch = length_phys_unit / length_lbm_unit
     mesh_voxelized = mesh.voxelized(pitch=pitch)

From fee369a65d73f755701bff795d8d25d860576952 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 10 Mar 2026 14:19:06 -0400
Subject: [PATCH 201/208] Refactoring to remove duplications.

- Unified multi-resolution recursion builder in `simulation_manager.py` to streamline the construction of simulation steps.
- Refactored `nse_multires_stepper.py` for improved clarity.
- Updated performance optimization handling in `multires_momentum_transfer.py` to support multiple fusion strategies.
---
 xlb/helper/simulation_manager.py              | 370 +++--------
 .../force/multires_momentum_transfer.py       |  14 +-
 xlb/operator/stepper/nse_multires_stepper.py  | 611 +++++++-----------
 3 files changed, 327 insertions(+), 668 deletions(-)

diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 53f8a335..3d7e95b0 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -88,311 +88,101 @@ def step(self):
         self.iteration_idx = self.iteration_idx + 1
         self.sk.run()
 
-    # Construct the stepper skeleton
-    def _construct_stepper_skeleton(self):
-        self.app = []
-
-        def recursion_reference(level, app):
-            if level < 0:
-                return
-
-            omega = self.omega_list[level]
-
-            self.add_to_app(
-                app=app,
-                op_name="collide_coarse",
-                level=level,
-                f_0_fd=self.f_0,
-                f_1_fd=self.f_1,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=omega,
-                timestep=0,
-            )
-
-            recursion_reference(level - 1, app)
-            recursion_reference(level - 1, app)
-
-            # Swapping of f_0 and f_1
-            self.add_to_app(
-                app=app,
-                op_name="stream_coarse_step_ABC",
-                level=level,
-                f_0=self.f_1,
-                f_1=self.f_0,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
-                omega=self.coalescence_factor,
-                timestep=0,
-            )
-
-        def recursion_fused_finest(level, app):
-            if level < 0:
-                return
-
-            omega = self.omega_list[level]
-
-            if level == 0:
-                self.add_to_app(
-                    app=app,
-                    op_name="finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_0,
-                    f_1_fd=self.f_1,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                    timestep=0,
-                    is_f1_the_explosion_src_field=True,
-                )
-                self.add_to_app(
-                    app=app,
-                    op_name="finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_1,
-                    f_1_fd=self.f_0,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                    timestep=0,
-                    is_f1_the_explosion_src_field=False,
-                )
-                return
-
-            self.add_to_app(
-                app=app,
-                op_name="collide_coarse",
-                level=level,
-                f_0_fd=self.f_0,
-                f_1_fd=self.f_1,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=omega,
-                timestep=0,
-            )
-
-            if level - 1 == 0:
-                recursion_fused_finest(level - 1, app)
-            else:
-                recursion_fused_finest(level - 1, app)
-                recursion_fused_finest(level - 1, app)
-            # Swapping of f_0 and f_1
-            self.add_to_app(
-                app=app,
-                op_name="stream_coarse_step_ABC",
-                level=level,
-                f_0_fd=self.f_1,
-                f_1_fd=self.f_0,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=self.coalescence_factor,
-                timestep=0,
-            )
-
-        def recursion_fused_finest_SFV(level, app):
-            if level < 0:
-                return
-
-            omega = self.omega_list[level]
-
-            if level == 0:
-                self.add_to_app(
-                    app=app,
-                    op_name="CFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_0,
-                    f_1_fd=self.f_1,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                    timestep=0,
-                    is_f1_the_explosion_src_field=True,
-                )
-                self.add_to_app(
-                    app=app,
-                    op_name="SFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_0,
-                    f_1_fd=self.f_1,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                )
-                self.add_to_app(
-                    app=app,
-                    op_name="CFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_1,
-                    f_1_fd=self.f_0,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                    timestep=0,
-                    is_f1_the_explosion_src_field=False,
-                )
-                self.add_to_app(
-                    app=app,
-                    op_name="SFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_1,
-                    f_1_fd=self.f_0,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                )
-                return
+    def _build_recursion(self, level, app, config):
+        """Unified multi-resolution recursion builder.
+
+        config keys:
+            finest_ops:         list of (op_name, swap_fields, extra_kwargs) for level 0,
+                                or None to treat level 0 like any coarse level.
+            coarse_collide_ops: list of op_names for coarse collision.
+            coarse_stream_ops:  list of (op_name, extra_kwargs) for coarse streaming.
+            fuse_finest:        if True, recurse once (not twice) when child is at level 0.
+        """
+        if level < 0:
+            return
 
-            self.add_to_app(
-                app=app,
-                op_name="collide_coarse",
-                level=level,
-                f_0_fd=self.f_0,
-                f_1_fd=self.f_1,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=omega,
-                timestep=0,
-            )
+        omega = self.omega_list[level]
+        fields = dict(f_0_fd=self.f_0, f_1_fd=self.f_1, bc_mask_fd=self.bc_mask, missing_mask_fd=self.missing_mask)
+        fields_swapped = dict(f_0_fd=self.f_1, f_1_fd=self.f_0, bc_mask_fd=self.bc_mask, missing_mask_fd=self.missing_mask)
 
-            if level - 1 == 0:
-                recursion_fused_finest_SFV(level - 1, app)
-            else:
-                recursion_fused_finest_SFV(level - 1, app)
-                recursion_fused_finest_SFV(level - 1, app)
-            # Swapping of f_0 and f_1
-            self.add_to_app(
-                app=app,
-                op_name="stream_coarse_step_ABC",
-                level=level,
-                f_0_fd=self.f_1,
-                f_1_fd=self.f_0,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=self.coalescence_factor,
-                timestep=0,
-            )
+        if level == 0 and config["finest_ops"] is not None:
+            for op_name, swap, extra in config["finest_ops"]:
+                base = fields_swapped if swap else fields
+                self.add_to_app(app=app, op_name=op_name, level=level, **base, omega=omega, **extra)
+            return
 
-        def recursion_fused_finest_SFV_all(level, app):
-            if level < 0:
-                return
+        for op_name in config["coarse_collide_ops"]:
+            self.add_to_app(app=app, op_name=op_name, level=level, **fields, omega=omega, timestep=0)
 
-            omega = self.omega_list[level]
+        if config["fuse_finest"] and level - 1 == 0:
+            self._build_recursion(level - 1, app, config)
+        else:
+            self._build_recursion(level - 1, app, config)
+            self._build_recursion(level - 1, app, config)
 
-            if level == 0:
-                self.add_to_app(
-                    app=app,
-                    op_name="CFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_0,
-                    f_1_fd=self.f_1,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                    timestep=0,
-                    is_f1_the_explosion_src_field=True,
-                )
-                self.add_to_app(
-                    app=app,
-                    op_name="SFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_0,
-                    f_1_fd=self.f_1,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                )
-                self.add_to_app(
-                    app=app,
-                    op_name="CFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_1,
-                    f_1_fd=self.f_0,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                    timestep=0,
-                    is_f1_the_explosion_src_field=False,
-                )
-                self.add_to_app(
-                    app=app,
-                    op_name="SFV_finest_fused_pull",
-                    level=level,
-                    f_0_fd=self.f_1,
-                    f_1_fd=self.f_0,
-                    bc_mask_fd=self.bc_mask,
-                    missing_mask_fd=self.missing_mask,
-                    omega=omega,
-                )
-                return
+        for op_name, extra in config["coarse_stream_ops"]:
+            self.add_to_app(app=app, op_name=op_name, level=level, **fields_swapped, **extra)
 
-            self.add_to_app(
-                app=app,
-                op_name="CFV_collide_coarse",
-                level=level,
-                f_0_fd=self.f_0,
-                f_1_fd=self.f_1,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=omega,
-                timestep=0,
-            )
-            self.add_to_app(
-                app=app,
-                op_name="SFV_collide_coarse",
-                level=level,
-                f_0_fd=self.f_0,
-                f_1_fd=self.f_1,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=omega,
-                timestep=0,
-            )
+    def _construct_stepper_skeleton(self):
+        self.app = []
 
-            if level - 1 == 0:
-                recursion_fused_finest_SFV_all(level - 1, app)
-            else:
-                recursion_fused_finest_SFV_all(level - 1, app)
-                recursion_fused_finest_SFV_all(level - 1, app)
-            # Swapping of f_0 and f_1
-            self.add_to_app(
-                app=app,
-                op_name="SFV_stream_coarse_step_ABC",
-                level=level,
-                f_0_fd=self.f_1,
-                f_1_fd=self.f_0,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-                omega=self.coalescence_factor,
-                timestep=0,
-            )
-            self.add_to_app(
-                app=app,
-                op_name="SFV_stream_coarse_step",
-                level=level,
-                f_0_fd=self.f_1,
-                f_1_fd=self.f_0,
-                bc_mask_fd=self.bc_mask,
-                missing_mask_fd=self.missing_mask,
-            )
+        stream_abc = {"omega": self.coalescence_factor, "timestep": 0}
+
+        # Finest-level op descriptors: (op_name, swap_f0_f1, extra_kwargs)
+        fused_pull_finest = [
+            ("finest_fused_pull", False, {"timestep": 0, "is_f1_the_explosion_src_field": True}),
+            ("finest_fused_pull", True, {"timestep": 0, "is_f1_the_explosion_src_field": False}),
+        ]
+        sfv_fused_pull_finest = [
+            ("CFV_finest_fused_pull", False, {"timestep": 0, "is_f1_the_explosion_src_field": True}),
+            ("SFV_finest_fused_pull", False, {}),
+            ("CFV_finest_fused_pull", True, {"timestep": 0, "is_f1_the_explosion_src_field": False}),
+            ("SFV_finest_fused_pull", True, {}),
+        ]
+
+        configs = {
+            MresPerfOptimizationType.NAIVE_COLLIDE_STREAM: {
+                "finest_ops": None,
+                "coarse_collide_ops": ["collide_coarse"],
+                "coarse_stream_ops": [("stream_coarse_step_ABC", stream_abc)],
+                "fuse_finest": False,
+            },
+            MresPerfOptimizationType.FUSION_AT_FINEST: {
+                "finest_ops": fused_pull_finest,
+                "coarse_collide_ops": ["collide_coarse"],
+                "coarse_stream_ops": [("stream_coarse_step_ABC", stream_abc)],
+                "fuse_finest": True,
+            },
+            MresPerfOptimizationType.FUSION_AT_FINEST_SFV: {
+                "finest_ops": sfv_fused_pull_finest,
+                "coarse_collide_ops": ["collide_coarse"],
+                "coarse_stream_ops": [("stream_coarse_step_ABC", stream_abc)],
+                "fuse_finest": True,
+            },
+            MresPerfOptimizationType.FUSION_AT_FINEST_SFV_ALL: {
+                "finest_ops": sfv_fused_pull_finest,
+                "coarse_collide_ops": ["CFV_collide_coarse", "SFV_collide_coarse"],
+                "coarse_stream_ops": [("SFV_stream_coarse_step_ABC", stream_abc), ("SFV_stream_coarse_step", {})],
+                "fuse_finest": True,
+            },
+        }
+
+        config = configs.get(self.mres_perf_opt)
+        if config is None:
+            raise ValueError(f"Unknown optimization level: {self.mres_perf_opt}")
 
-        if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
-            recursion_reference(self.count_levels - 1, app=self.app)
-        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
-            recursion_fused_finest(self.count_levels - 1, app=self.app)
-        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_SFV:
+        # Pre-recursion SFV mask setup
+        if self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_SFV:
             wp.synchronize()
             self.neon_container["SFV_reset_bc_mask"](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             wp.synchronize()
-            recursion_fused_finest_SFV(self.count_levels - 1, app=self.app)
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_SFV_ALL:
             wp.synchronize()
-            num_levels = self.f_0.get_grid().num_levels
-            for l in range(num_levels):
+            for l in range(self.f_0.get_grid().num_levels):
                 self.neon_container["SFV_reset_bc_mask"](l, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
             wp.synchronize()
-            recursion_fused_finest_SFV_all(self.count_levels - 1, app=self.app)
-        else:
-            raise ValueError(f"Unknown optimization level: {self.mres_perf_opt}")
+
+        self._build_recursion(self.count_levels - 1, self.app, config)
 
         bk = self.grid.get_neon_backend()
         self.sk = neon.Skeleton(backend=bk)
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 683b6dc1..485ae1e0 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -31,17 +31,21 @@ def __init__(
             raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {compute_backend} backend.")
 
         # Set the sequence of operations based on the performance optimization type
-        if mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
-            operation_sequence = LBMOperationSequence.STREAM_THEN_COLLIDE
-        elif mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
+        if mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             operation_sequence = LBMOperationSequence.COLLIDE_THEN_STREAM
+        elif mres_perf_opt in (
+            MresPerfOptimizationType.FUSION_AT_FINEST,
+            MresPerfOptimizationType.FUSION_AT_FINEST_SFV,
+            MresPerfOptimizationType.FUSION_AT_FINEST_SFV_ALL,
+        ):
+            operation_sequence = LBMOperationSequence.STREAM_THEN_COLLIDE
         else:
             raise ValueError(f"Unknown performance optimization type: {mres_perf_opt}")
 
         # Check if the performance optimization type is compatible with the use of mesh distance
-        if mres_perf_opt != MresPerfOptimizationType.FUSION_AT_FINEST:
+        if operation_sequence != LBMOperationSequence.STREAM_THEN_COLLIDE:
             assert not no_slip_bc_instance.needs_mesh_distance, (
-                "MultiresMomentumTransfer operator does not support mesh distance for performance optimization other than fusion at the finest level."
+                "Mesh distance is only supported in the MultiresMomentumTransfer operator when the LBM operation sequence is STREAM_THEN_COLLIDE."
             )
 
         # Print a warning to the user about the boundary voxels
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index f42c60a8..143b41f5 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -262,11 +262,15 @@ def _initialize_auxiliary_data(boundary_conditions, f_1, bc_mask, missing_mask):
         return f_1
 
     def _construct_neon(self):
-        # Set local constants
+        # Pre-capture self attributes that Warp cannot resolve inside @wp.func bodies.
+        # Warp rejects `self` as an "Invalid external reference type" when it appears
+        # in a plain assignment (e.g. `_c = self.velocity_set.c`).  Capturing here
+        # makes these values available as simple closure variables.
         lattice_central_index = self.velocity_set.center_index
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _opp_indices = self.velocity_set.opp_indices
+        _c = self.velocity_set.c
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
@@ -353,150 +357,172 @@ def neon_apply_aux_recovery_bc(
                                 _f1_thread = wp.neon_read(f_1_pn, index, _opp_indices[l])
                                 wp.neon_write(f_0_pn, index, _opp_indices[l], self.store_dtype(_f1_thread))
 
-        @neon.Container.factory(name="collide_coarse")
-        def collide_coarse(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
+        @wp.func
+        def neon_collide_and_write(
+            index: Any,
+            timestep: Any,
+            _boundary_id: Any,
+            _missing_mask: Any,
+            f_0_pn: Any,
+            f_1_pn: Any,
+            _f_post_stream: Any,
             omega: Any,
-            timestep: int,
+            num_levels: int,
+            level: int,
+            accumulation_pn: Any,
+            do_bc: bool,
+            do_accumulation: bool,
         ):
+            _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+            _feq = self.equilibrium.neon_functional(_rho, _u)
+            _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
+
+            if do_bc:
+                _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
+                neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
+            if do_accumulation:
+                for l in range(self.velocity_set.q):
+                    push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                    if level < num_levels - 1:
+                        wp.neon_mres_lbm_store_op(accumulation_pn, index, l, push_direction, _f_post_collision[l])
+                    wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+            else:
+                for l in range(self.velocity_set.q):
+                    wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+            return _f_post_collision
+
+        @wp.func
+        def neon_stream_with_mres(
+            index: Any,
+            f_0_pn: Any,
+            coalescence_factor_pn: Any,
+        ):
+            _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+            for l in range(self.velocity_set.q):
+                if l == lattice_central_index:
+                    continue
+
+                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                has_ngh_at_same_level = wp.bool(False)
+                accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                    if not has_ngh_at_same_level:
+                        if wp.neon_has_parent(f_0_pn, index):
+                            has_a_coarser_ngh = wp.bool(False)
+                            exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh)
+                            if has_a_coarser_ngh:
+                                _f_post_stream[l] = exploded_pop
+                else:
+                    if has_ngh_at_same_level:
+                        coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
+                        accumulated = accumulated * coalescence_factor
+                        _f_post_stream[l] = accumulated
+
+            return _f_post_stream
+
+        @neon.Container.factory(name="collide_coarse")
+        def collide_coarse(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
             num_levels = f_0_fd.get_grid().num_levels
 
-            def ll_collide_coarse(loader: neon.Loader):
+            def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
                 if level + 1 < f_0_fd.get_grid().num_levels:
                     f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
                 else:
                     f_0_pn = loader.get_mres_read_handle(f_0_fd)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
-                # fake loading to enforce sequential step
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
-
                 @wp.func
                 def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(BC_SOLID):
                         return
-
                     if not wp.neon_has_child(f_0_pn, index):
-                        # Read thread data for populations, these are post streaming
                         _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                        _f_post_stream = _f0_thread
-
-                        _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                        _feq = self.equilibrium.neon_functional(_rho, _u)
-                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
-
-                        # Apply post-collision boundary conditions
-                        _f_post_collision = apply_bc(
-                            index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                        neon_collide_and_write(
+                            index,
+                            timestep,
+                            _boundary_id,
+                            _missing_mask,
+                            f_0_pn,
+                            f_1_pn,
+                            _f0_thread,
+                            omega,
+                            num_levels,
+                            level,
+                            f_1_pn,
+                            True,
+                            True,
                         )
-
-                        # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
-                        neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
-
-                        # Accumulate the post-collision populations in f_0
-                        for l in range(self.velocity_set.q):
-                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                            if level < num_levels - 1:
-                                val = _f_post_collision[l]
-                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
-
-                            wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
 
                 loader.declare_kernel(device)
 
-            return ll_collide_coarse
+            return ll
 
         @neon.Container.factory(name="SFV_collide_coarse")
-        def SFV_collide_coarse(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
-            """
-            This container will execute the collision operator only on the SFV at the coarsest level.
-            """
+        def SFV_collide_coarse(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+            """Collision on SFV voxels only — no BCs, no multi-resolution accumulation."""
 
-            def ll_collide_coarse(loader: neon.Loader):
+            def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
-
                 @wp.func
                 def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(BC_SFV):
                         return
-
-                    # Read thread data for populations, these are post streaming
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                    _f_post_stream = _f0_thread
-
-                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                    _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
-
-                    for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                    neon_collide_and_write(
+                        index,
+                        0,
+                        _boundary_id,
+                        _missing_mask,
+                        f_0_pn,
+                        f_1_pn,
+                        _f0_thread,
+                        omega,
+                        0,
+                        level,
+                        f_1_pn,
+                        False,
+                        False,
+                    )
 
                 loader.declare_kernel(device)
 
-            return ll_collide_coarse
+            return ll
 
         @neon.Container.factory(name="CFV_collide_coarse")
-        def CFV_collide_coarse(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
+        def CFV_collide_coarse(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+            """Collision on CFV voxels only — skips both solid and SFV."""
             num_levels = f_0_fd.get_grid().num_levels
 
-            def ll_collide_coarse(loader: neon.Loader):
+            def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
                 if level + 1 < f_0_fd.get_grid().num_levels:
                     f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
                 else:
                     f_0_pn = loader.get_mres_read_handle(f_0_fd)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
-                # fake loading to enforce sequential step
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
-
                 @wp.func
                 def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
@@ -505,200 +531,97 @@ def device(index: Any):
                     if _boundary_id == wp.uint8(BC_SFV):
                         return
                     if not wp.neon_has_child(f_0_pn, index):
-                        # Read thread data for populations, these are post streaming
                         _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                        _f_post_stream = _f0_thread
-
-                        _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                        _feq = self.equilibrium.neon_functional(_rho, _u)
-                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
-
-                        # Apply post-collision boundary conditions
-                        _f_post_collision = apply_bc(
-                            index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                        neon_collide_and_write(
+                            index,
+                            timestep,
+                            _boundary_id,
+                            _missing_mask,
+                            f_0_pn,
+                            f_1_pn,
+                            _f0_thread,
+                            omega,
+                            num_levels,
+                            level,
+                            f_1_pn,
+                            True,
+                            True,
                         )
-
-                        # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
-                        neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
-
-                        # Accumulate the post-collision populations in f_0
-                        for l in range(self.velocity_set.q):
-                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                            if level < num_levels - 1:
-                                val = _f_post_collision[l]
-                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
-
-                            wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
                     else:
                         for l in range(self.velocity_set.q):
                             wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
 
                 loader.declare_kernel(device)
 
-            return ll_collide_coarse
+            return ll
 
         @neon.Container.factory(name="stream_coarse_step_ABC")
-        def stream_coarse_step_ABC(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
-            num_levels = f_0_fd.get_grid().num_levels
-
-            def ll_stream_coarse(loader: neon.Loader):
+        def stream_coarse_step_ABC(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+            def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-
-                _c = self.velocity_set.c
-
-                coalescence_factor_fd = omega
-                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
+                coalescence_factor_pn = loader.get_mres_read_handle(omega)
 
                 @wp.func
-                def cl_stream_coarse(index: Any):
+                def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(BC_SOLID):
                         return
-
                     if wp.neon_has_child(f_0_pn, index):
-                        # we are a halo cell so we just exit
                         return
 
-                    # do stream normally
-                    _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
-                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-
-                    for l in range(self.velocity_set.q):
-                        if l == lattice_central_index:
-                            continue
+                    _f_post_stream = neon_stream_with_mres(index, f_0_pn, coalescence_factor_pn)
 
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-
-                        has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
-
-                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
-                            if not has_ngh_at_same_level:
-                                if wp.neon_has_parent(f_0_pn, index):
-                                    has_a_coarser_ngh = wp.bool(False)
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                    if has_a_coarser_ngh:
-                                        # Explosion
-                                        _f_post_stream[l] = exploded_pop
-                        else:
-                            if has_ngh_at_same_level:
-                                # Coalescence
-                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
-                                accumulated = accumulated * coalescence_factor
-                                _f_post_stream[l] = accumulated
-
-                    # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
-
-                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
 
-                loader.declare_kernel(cl_stream_coarse)
+                loader.declare_kernel(device)
 
-            return ll_stream_coarse
+            return ll
 
         @neon.Container.factory(name="SFV_stream_coarse_step_ABC")
-        def SFV_stream_coarse_step_ABC(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
-            def ll_stream_coarse(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+        def SFV_stream_coarse_step_ABC(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+            """Stream on CFV (non-SFV) voxels only — skips SFV, solid, and voxels that have children."""
 
+            def ll(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-
-                _c = self.velocity_set.c
-
-                coalescence_factor_fd = omega
-                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
+                coalescence_factor_pn = loader.get_mres_read_handle(omega)
 
                 @wp.func
-                def cl_stream_coarse(index: Any):
+                def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-
                     if _boundary_id == wp.uint8(BC_SFV):
                         return
                     if _boundary_id == wp.uint8(BC_SOLID):
                         return
-
                     if wp.neon_has_child(f_0_pn, index):
-                        # we are a halo cell so we just exit
                         return
 
-                    # do stream normally
-                    _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
-                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-
-                    for l in range(self.velocity_set.q):
-                        if l == lattice_central_index:
-                            continue
-
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-
-                        has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
-
-                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
-                            if not has_ngh_at_same_level:
-                                if wp.neon_has_parent(f_0_pn, index):
-                                    has_a_coarser_ngh = wp.bool(False)
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                    if has_a_coarser_ngh:
-                                        # Explosion
-                                        _f_post_stream[l] = exploded_pop
-                        else:
-                            if has_ngh_at_same_level:
-                                # Coalescence
-                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
-                                accumulated = accumulated * coalescence_factor
-                                _f_post_stream[l] = accumulated
+                    _f_post_stream = neon_stream_with_mres(index, f_0_pn, coalescence_factor_pn)
 
-                    # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
-
-                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
 
-                loader.declare_kernel(cl_stream_coarse)
+                loader.declare_kernel(device)
 
-            return ll_stream_coarse
+            return ll
 
         @neon.Container.factory(name="SFV_reset_bc_mask")
         def SFV_reset_bc_mask(
@@ -910,6 +833,34 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
+        @wp.func
+        def neon_stream_finest_with_explosion(
+            index: Any,
+            f_0_pn: Any,
+            explosion_src_pn: Any,
+        ):
+            _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+            for l in range(self.velocity_set.q):
+                if l == lattice_central_index:
+                    continue
+
+                pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                has_ngh_at_same_level = wp.bool(False)
+                accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                if not has_ngh_at_same_level:
+                    if wp.neon_has_parent(f_0_pn, index):
+                        has_a_coarser_ngh = wp.bool(False)
+                        exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                            explosion_src_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                        )
+                        if has_a_coarser_ngh:
+                            _f_post_stream[l] = exploded_pop
+
+            return _f_post_stream
+
         @neon.Container.factory(name="finest_fused_pull")
         def finest_fused_pull(
             level: int,
@@ -923,95 +874,55 @@ def finest_fused_pull(
         ):
             if level != 0:
                 raise Exception("Only the finest level is supported for now")
-            grid = f_0_fd.get_grid()
-            num_levels = grid.num_levels
+            num_levels = f_0_fd.get_grid().num_levels
 
-            def finest_fused_pull_launcher(loader: neon.Loader):
+            def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
                 if level + 1 < f_0_fd.get_grid().num_levels:
                     f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
                 else:
                     f_0_pn = loader.get_mres_read_handle(f_0_fd)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
+                explosion_src_pn = f_1_pn if is_f1_the_explosion_src_field else f_0_pn
+                accumulation_pn = f_1_pn if is_f1_the_explosion_src_field else f_0_pn
 
                 @wp.func
-                def finest_fused_pull_kernel(index: Any):
+                def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(BC_SOLID):
                         return
-
                     if wp.neon_has_child(f_0_pn, index):
-                        # we are a halo cell so we just exit
                         return
 
-                    # do stream normally
-                    _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
-                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+                    _f_post_stream = neon_stream_finest_with_explosion(index, f_0_pn, explosion_src_pn)
 
-                    for l in range(self.velocity_set.q):
-                        if l == lattice_central_index:
-                            continue
-
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-
-                        has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
-
-                        if not has_ngh_at_same_level:
-                            if wp.neon_has_parent(f_0_pn, index):
-                                has_a_coarser_ngh = wp.bool(False)
-                                if is_f1_the_explosion_src_field:
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                else:
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                if has_a_coarser_ngh:
-                                    # Explosion
-                                    _f_post_stream[l] = exploded_pop
-
-                    # do non mres post-streaming corrections
+                    # Post-streaming boundary conditions
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
-                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                    _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
-
-                    # Apply post-collision boundary conditions
-                    _f_post_collision = apply_bc(
-                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                    neon_collide_and_write(
+                        index,
+                        timestep,
+                        _boundary_id,
+                        _missing_mask,
+                        f_0_pn,
+                        f_1_pn,
+                        _f_post_stream,
+                        omega,
+                        num_levels,
+                        level,
+                        accumulation_pn,
+                        True,
+                        True,
                     )
 
-                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
-                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
-
-                    # Accumulate the post-collision populations in f_0
-                    for l in range(self.velocity_set.q):
-                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                        if level < num_levels - 1:
-                            val = _f_post_collision[l]
-                            if is_f1_the_explosion_src_field:
-                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
-                            else:
-                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
-
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
-
-                loader.declare_kernel(finest_fused_pull_kernel)
+                loader.declare_kernel(device)
 
-            return finest_fused_pull_launcher
+            return ll
 
         @neon.Container.factory(name="CFV_finest_fused_pull")
         def CFV_finest_fused_pull(
@@ -1024,146 +935,100 @@ def CFV_finest_fused_pull(
             timestep: Any,
             is_f1_the_explosion_src_field: bool,
         ):
+            """Fused stream+collide on CFV voxels at the finest level — skips SFV and solid."""
             if level != 0:
                 raise Exception("Only the finest level is supported for now")
-            grid = f_0_fd.get_grid()
-            num_levels = grid.num_levels
+            num_levels = f_0_fd.get_grid().num_levels
 
-            def finest_fused_pull_launcher(loader: neon.Loader):
+            def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
                 if level + 1 < f_0_fd.get_grid().num_levels:
                     f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
                 else:
                     f_0_pn = loader.get_mres_read_handle(f_0_fd)
                     f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
+                explosion_src_pn = f_1_pn if is_f1_the_explosion_src_field else f_0_pn
+                accumulation_pn = f_1_pn if is_f1_the_explosion_src_field else f_0_pn
 
                 @wp.func
-                def finest_fused_pull_kernel(index: Any):
+                def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(BC_SOLID):
                         return
                     if _boundary_id == wp.uint8(BC_SFV):
                         return
-
                     if wp.neon_has_child(f_0_pn, index):
-                        # we are a halo cell so we just exit
                         return
 
-                    # do stream normally
-                    _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
-                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-
-                    for l in range(self.velocity_set.q):
-                        if l == lattice_central_index:
-                            continue
-
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-
-                        has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                    _f_post_stream = neon_stream_finest_with_explosion(index, f_0_pn, explosion_src_pn)
 
-                        if not has_ngh_at_same_level:
-                            if wp.neon_has_parent(f_0_pn, index):
-                                has_a_coarser_ngh = wp.bool(False)
-                                if is_f1_the_explosion_src_field:
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                else:
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                if has_a_coarser_ngh:
-                                    # Explosion
-                                    _f_post_stream[l] = exploded_pop
-
-                    # do non mres post-streaming corrections
+                    # Post-streaming boundary conditions
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
-                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                    _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
-
-                    # Apply post-collision boundary conditions
-                    _f_post_collision = apply_bc(
-                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                    neon_collide_and_write(
+                        index,
+                        timestep,
+                        _boundary_id,
+                        _missing_mask,
+                        f_0_pn,
+                        f_1_pn,
+                        _f_post_stream,
+                        omega,
+                        num_levels,
+                        level,
+                        accumulation_pn,
+                        True,
+                        True,
                     )
 
-                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
-                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
-
-                    # Accumulate the post-collision populations in f_0
-                    for l in range(self.velocity_set.q):
-                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                        if level < num_levels - 1:
-                            val = _f_post_collision[l]
-                            if is_f1_the_explosion_src_field:
-                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
-                            else:
-                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
-
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
-
-                loader.declare_kernel(finest_fused_pull_kernel)
+                loader.declare_kernel(device)
 
-            return finest_fused_pull_launcher
+            return ll
 
         @neon.Container.factory(name="SFV_finest_fused_pull")
-        def SFV_finest_fused_pull(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-        ):
+        def SFV_finest_fused_pull(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any):
+            """Fused stream+collide on SFV voxels at the finest level — no BCs, no explosion."""
             if level != 0:
                 raise Exception("Only the finest level is supported for now")
 
-            def finest_fused_pull_launcher(loader: neon.Loader):
+            def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
-
                 @wp.func
-                def finest_fused_pull_kernel_SFV(index: Any):
+                def device(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id != wp.uint8(BC_SFV):
                         return
-
-                    # do stream normally
-                    _missing_mask = _missing_mask_vec()
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                    _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+                    neon_collide_and_write(
+                        index,
+                        0,
+                        _boundary_id,
+                        _missing_mask,
+                        f_0_pn,
+                        f_1_pn,
+                        _f_post_stream,
+                        omega,
+                        0,
+                        0,
+                        f_1_pn,
+                        False,
+                        False,
+                    )
 
-                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-                    _feq = self.equilibrium.neon_functional(_rho, _u)
-                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
-
-                    for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
-
-                loader.declare_kernel(finest_fused_pull_kernel_SFV)
+                loader.declare_kernel(device)
 
-            return finest_fused_pull_launcher
+            return ll
 
         @neon.Container.factory(name="stream_coarse_step_C")
         def stream_coarse_step_C(

From 22ecca8fd538b798cef9bc0e979e7a45f379dc6f Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 10 Mar 2026 14:42:42 -0400
Subject: [PATCH 202/208] Removed redundant functions from
 `nse_multires_stepper.py` to streamline and clarify the implementation of
 multi-resolution streaming steps.

---
 xlb/operator/stepper/nse_multires_stepper.py | 157 -------------------
 1 file changed, 157 deletions(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 143b41f5..3894eb7c 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -692,50 +692,6 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse_step_A")
-        def stream_coarse_step_A(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
-            def ll_stream_coarse(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
-                f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
-                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-
-                _c = self.velocity_set.c
-
-                @wp.func
-                def cl_stream_coarse(index: Any):
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(BC_SOLID):
-                        return
-
-                    if wp.neon_has_child(f_0_pn, index):
-                        # we are a halo cell so we just exit
-                        return
-
-                    # do stream normally
-                    _missing_mask = _missing_mask_vec()
-                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                    _f_post_collision = _f0_thread
-                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-
-                    for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-
-                loader.declare_kernel(cl_stream_coarse)
-
-            return ll_stream_coarse
-
         @neon.Container.factory(name="SFV_stream_coarse_step")
         def SFV_stream_coarse_step(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any):
             def ll_stream_coarse(loader: neon.Loader):
@@ -771,68 +727,6 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse_step_B")
-        def stream_coarse_step_B(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-        ):
-            def ll_stream_coarse(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-                coalescence_factor_fd = omega
-                f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
-                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
-
-                _c = self.velocity_set.c
-                _w = self.velocity_set.w
-
-                @wp.func
-                def cl_stream_coarse(index: Any):
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(BC_SOLID):
-                        return
-
-                    if wp.neon_has_child(f_0_pn, index):
-                        # we are a halo cell so we just exit
-                        return
-
-                    for l in range(self.velocity_set.q):
-                        if l == lattice_central_index:
-                            continue
-
-                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
-
-                        has_ngh_at_same_level = wp.bool(False)
-                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
-
-                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
-                            if not has_ngh_at_same_level:
-                                if wp.neon_has_parent(f_0_pn, index):
-                                    has_a_coarser_ngh = wp.bool(False)
-                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
-                                    )
-                                    if has_a_coarser_ngh:
-                                        # Explosion
-                                        wp.neon_write(f_1_pn, index, l, exploded_pop)
-                        else:
-                            if has_ngh_at_same_level:
-                                # Coalescence
-                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
-                                accumulated = accumulated * coalescence_factor
-                                wp.neon_write(f_1_pn, index, l, accumulated)
-
-                loader.declare_kernel(cl_stream_coarse)
-
-            return ll_stream_coarse
-
         @wp.func
         def neon_stream_finest_with_explosion(
             index: Any,
@@ -1030,60 +924,9 @@ def device(index: Any):
 
             return ll
 
-        @neon.Container.factory(name="stream_coarse_step_C")
-        def stream_coarse_step_C(
-            level: int,
-            f_0_fd: Any,
-            f_1_fd: Any,
-            bc_mask_fd: Any,
-            missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
-        ):
-            def ll_stream_coarse(loader: neon.Loader):
-                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-
-                f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                f_1_pn = loader.get_mres_write_handle(f_1_fd)
-
-                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
-                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-
-                _c = self.velocity_set.c
-
-                @wp.func
-                def cl_stream_coarse(index: Any):
-                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
-                    if _boundary_id == wp.uint8(BC_SOLID):
-                        return
-
-                    if wp.neon_has_child(f_0_pn, index):
-                        # we are a halo cell so we just exit
-                        return
-
-                    _missing_mask = _missing_mask_vec()
-                    # do stream normally
-                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                    _f1_thread, _missing_mask = neon_get_thread_data(f_1_pn, missing_mask_pn, index)
-                    _f_post_collision = _f0_thread
-                    _f_post_stream = _f1_thread
-
-                    # do non mres post-streaming corrections
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
-
-                    for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-
-                loader.declare_kernel(cl_stream_coarse)
-
-            return ll_stream_coarse
-
         return None, {
             "collide_coarse": collide_coarse,
             "stream_coarse_step_ABC": stream_coarse_step_ABC,
-            "stream_coarse_step_A": stream_coarse_step_A,
-            "stream_coarse_step_B": stream_coarse_step_B,
-            "stream_coarse_step_C": stream_coarse_step_C,
             "finest_fused_pull": finest_fused_pull,
             "CFV_finest_fused_pull": CFV_finest_fused_pull,
             "SFV_finest_fused_pull": SFV_finest_fused_pull,

From d9cb382f1c42357c7a281b68b37334009b661ea8 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Tue, 10 Mar 2026 14:51:23 -0400
Subject: [PATCH 203/208] Renamed functions in `nse_multires_stepper.py` for
 improved clarity.

---
 xlb/operator/stepper/nse_multires_stepper.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 3894eb7c..1d75f9fc 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -358,7 +358,7 @@ def neon_apply_aux_recovery_bc(
                                 wp.neon_write(f_0_pn, index, _opp_indices[l], self.store_dtype(_f1_thread))
 
         @wp.func
-        def neon_collide_and_write(
+        def neon_collide_pipeline(
             index: Any,
             timestep: Any,
             _boundary_id: Any,
@@ -394,7 +394,7 @@ def neon_collide_and_write(
             return _f_post_collision
 
         @wp.func
-        def neon_stream_with_mres(
+        def neon_stream_explode_coalesce(
             index: Any,
             f_0_pn: Any,
             coalescence_factor_pn: Any,
@@ -447,7 +447,7 @@ def device(index: Any):
                         return
                     if not wp.neon_has_child(f_0_pn, index):
                         _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                        neon_collide_and_write(
+                        neon_collide_pipeline(
                             index,
                             timestep,
                             _boundary_id,
@@ -487,7 +487,7 @@ def device(index: Any):
                     if _boundary_id != wp.uint8(BC_SFV):
                         return
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                    neon_collide_and_write(
+                    neon_collide_pipeline(
                         index,
                         0,
                         _boundary_id,
@@ -532,7 +532,7 @@ def device(index: Any):
                         return
                     if not wp.neon_has_child(f_0_pn, index):
                         _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                        neon_collide_and_write(
+                        neon_collide_pipeline(
                             index,
                             timestep,
                             _boundary_id,
@@ -575,7 +575,7 @@ def device(index: Any):
 
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
-                    _f_post_stream = neon_stream_with_mres(index, f_0_pn, coalescence_factor_pn)
+                    _f_post_stream = neon_stream_explode_coalesce(index, f_0_pn, coalescence_factor_pn)
 
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
@@ -611,7 +611,7 @@ def device(index: Any):
 
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_collision = _f0_thread
-                    _f_post_stream = neon_stream_with_mres(index, f_0_pn, coalescence_factor_pn)
+                    _f_post_stream = neon_stream_explode_coalesce(index, f_0_pn, coalescence_factor_pn)
 
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
@@ -798,7 +798,7 @@ def device(index: Any):
                     # Post-streaming boundary conditions
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
-                    neon_collide_and_write(
+                    neon_collide_pipeline(
                         index,
                         timestep,
                         _boundary_id,
@@ -864,7 +864,7 @@ def device(index: Any):
                     # Post-streaming boundary conditions
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
 
-                    neon_collide_and_write(
+                    neon_collide_pipeline(
                         index,
                         timestep,
                         _boundary_id,
@@ -904,7 +904,7 @@ def device(index: Any):
                         return
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-                    neon_collide_and_write(
+                    neon_collide_pipeline(
                         index,
                         0,
                         _boundary_id,

From fe19b6579a1d8fad526d6e0fd8d7a167ce4f8093 Mon Sep 17 00:00:00 2001
From: Massimiliano Meneghin <massimiliano.meneghin@autodesk.com>
Date: Wed, 11 Mar 2026 13:19:07 +0100
Subject: [PATCH 204/208] Cleaning up multi-res stepper.

---
 requirements.txt                             |   1 +
 xlb/operator/stepper/nse_multires_stepper.py | 283 ++++++++++++-------
 2 files changed, 189 insertions(+), 95 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d379f30c..a3e9bbcb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ nvtx
 pytest
 ruff
 usd-core
+h5py
\ No newline at end of file
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 1d75f9fc..f969b97f 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -1,4 +1,70 @@
-# Base class for all multires stepper operators
+"""
+Multi-Resolution Navier-Stokes Stepper for the NEON Backend
+
+This module implements the multi-resolution LBM stepper using Warp kernels on the
+Neon multi-GPU runtime. It uses several programming patterns specific to Warp's
+compile-time code generation model.
+
+Compile-Time Specialization Pattern
+-----------------------------------
+Warp's @wp.func decorator traces Python code at kernel compilation time, not runtime.
+A boolean passed as a function parameter is still a runtime value in the generated code,
+so Warp emits both branches, raising register pressure even if only one is ever taken.
+
+To generate optimized, branch-free kernels, we use a **factory pattern** that captures
+boolean configuration at function-definition time:
+
+    def make_specialized_func(do_feature: bool):
+        @wp.func
+        def impl(...):
+            if wp.static(do_feature):  # Evaluated at compile time
+                # This code is only emitted when do_feature=True
+                ...
+            else:
+                # This code is only emitted when do_feature=False
+                ...
+        return impl
+
+    # Generate specialized variants
+    func_with_feature = make_specialized_func(do_feature=True)
+    func_without_feature = make_specialized_func(do_feature=False)
+
+The `wp.static()` call evaluates its argument during Warp's tracing phase. Since
+`do_feature` is a Python bool captured in the closure, Warp sees a constant and
+eliminates the dead branch entirely.
+
+This pattern is used for:
+- `apply_bc_post_streaming` / `apply_bc_post_collision`: Specialized BC application
+  for streaming vs collision implementation steps
+- `collide_bc_accum` / `collide_simple`: Collision pipeline variants with/without
+  BC application and multi-resolution accumulation
+
+Closure Capture for Self Attributes
+-----------------------------------
+Warp cannot resolve `self.X` in plain assignments inside @wp.func bodies (e.g.,
+`_c = self.velocity_set.c` fails with "Invalid external reference type"). However,
+it can resolve `self.X` in:
+- Function call contexts: `self.stream.neon_functional(...)`
+- Range arguments: `range(self.velocity_set.q)`
+- Type casts: `self.compute_dtype(0)`
+
+For other uses, we pre-capture attributes at the Python level before defining the
+@wp.func, making them available as simple closure variables:
+
+    _c = self.velocity_set.c  # Captured in Python scope
+
+    @wp.func
+    def my_kernel(...):
+        # Use _c directly — Warp sees it as a closure variable
+        direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), ...)
+
+Cell Type Constants
+-------------------
+Cell types are defined in `xlb.cell_type`:
+- BC_SFV (254): Simple Fluid Voxel — no BC, no explosion/coalescence
+- BC_SOLID (255): Solid obstacle voxel
+- BC_NONE (0): Regular fluid voxel with potential BCs or multi-res interactions
+"""
 import nvtx
 import warp as wp
 import neon
@@ -281,36 +347,42 @@ def _construct_neon(self):
             if bc_name.startswith("ExtrapolationOutflowBC"):
                 extrapolation_outflow_bc_ids.append(bc_id)
 
-        @wp.func
-        def apply_bc(
-            index: Any,
-            timestep: Any,
-            _boundary_id: Any,
-            _missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-            is_post_streaming: bool,
-        ):
-            f_result = f_post
-
-            # Unroll the loop over boundary conditions
-            for i in range(wp.static(len(self.boundary_conditions))):
-                if is_post_streaming:
-                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
-                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
-                else:
-                    if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
-                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
-                    if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
-                        if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                            f_result = wp.static(self.boundary_conditions[i].assemble_auxiliary_data)(
-                                index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
-                            )
-            return f_result
+        # Factory for apply_bc: generates compile-time specialized variants
+        def make_apply_bc(is_post_streaming: bool):
+            @wp.func
+            def apply_bc_impl(
+                index: Any,
+                timestep: Any,
+                _boundary_id: Any,
+                _missing_mask: Any,
+                f_0: Any,
+                f_1: Any,
+                f_pre: Any,
+                f_post: Any,
+            ):
+                f_result = f_post
+
+                for i in range(wp.static(len(self.boundary_conditions))):
+                    if wp.static(is_post_streaming):
+                        if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
+                            if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                                f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
+                    else:
+                        if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
+                            if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                                f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
+                        if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
+                            if _boundary_id == wp.static(self.boundary_conditions[i].id):
+                                f_result = wp.static(self.boundary_conditions[i].assemble_auxiliary_data)(
+                                    index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
+                                )
+                return f_result
+
+            return apply_bc_impl
+
+        # Compile-time specialized BC application variants
+        apply_bc_post_streaming = make_apply_bc(is_post_streaming=True)
+        apply_bc_post_collision = make_apply_bc(is_post_streaming=False)
 
         @wp.func
         def neon_get_thread_data(
@@ -357,41 +429,48 @@ def neon_apply_aux_recovery_bc(
                                 _f1_thread = wp.neon_read(f_1_pn, index, _opp_indices[l])
                                 wp.neon_write(f_0_pn, index, _opp_indices[l], self.store_dtype(_f1_thread))
 
-        @wp.func
-        def neon_collide_pipeline(
-            index: Any,
-            timestep: Any,
-            _boundary_id: Any,
-            _missing_mask: Any,
-            f_0_pn: Any,
-            f_1_pn: Any,
-            _f_post_stream: Any,
-            omega: Any,
-            num_levels: int,
-            level: int,
-            accumulation_pn: Any,
-            do_bc: bool,
-            do_accumulation: bool,
-        ):
-            _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
-            _feq = self.equilibrium.neon_functional(_rho, _u)
-            _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
-
-            if do_bc:
-                _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False)
-                neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
-
-            if do_accumulation:
-                for l in range(self.velocity_set.q):
-                    push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                    if level < num_levels - 1:
-                        wp.neon_mres_lbm_store_op(accumulation_pn, index, l, push_direction, _f_post_collision[l])
-                    wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
-            else:
-                for l in range(self.velocity_set.q):
-                    wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+        # Factory for neon_collide_pipeline: generates compile-time specialized variants
+        def make_collide_pipeline(do_bc: bool, do_accumulation: bool):
+            @wp.func
+            def collide_pipeline_impl(
+                index: Any,
+                timestep: Any,
+                _boundary_id: Any,
+                _missing_mask: Any,
+                f_0_pn: Any,
+                f_1_pn: Any,
+                _f_post_stream: Any,
+                omega: Any,
+                num_levels: int,
+                level: int,
+                accumulation_pn: Any,
+            ):
+                _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                _feq = self.equilibrium.neon_functional(_rho, _u)
+                _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
+
+                if wp.static(do_bc):
+                    _f_post_collision = apply_bc_post_collision(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision)
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
-            return _f_post_collision
+                if wp.static(do_accumulation):
+                    for l in range(self.velocity_set.q):
+                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                        if level < num_levels - 1:
+                            wp.neon_mres_lbm_store_op(accumulation_pn, index, l, push_direction, _f_post_collision[l])
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                else:
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                return _f_post_collision
+
+            return collide_pipeline_impl
+
+        # Compile-time specialized collision pipeline variants
+        collide_bc_accum = make_collide_pipeline(do_bc=True, do_accumulation=True)
+        collide_bc_only = make_collide_pipeline(do_bc=True, do_accumulation=False)
+        collide_simple = make_collide_pipeline(do_bc=False, do_accumulation=False)
 
         @wp.func
         def neon_stream_explode_coalesce(
@@ -426,7 +505,13 @@ def neon_stream_explode_coalesce(
             return _f_post_stream
 
         @neon.Container.factory(name="collide_coarse")
-        def collide_coarse(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+        def collide_coarse(level: int,
+                           f_0_fd: Any,
+                           f_1_fd: Any,
+                           bc_mask_fd: Any,
+                           missing_mask_fd: Any,
+                           omega: Any,
+                           timestep: int):
             num_levels = f_0_fd.get_grid().num_levels
 
             def ll(loader: neon.Loader):
@@ -447,7 +532,7 @@ def device(index: Any):
                         return
                     if not wp.neon_has_child(f_0_pn, index):
                         _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                        neon_collide_pipeline(
+                        collide_bc_accum(
                             index,
                             timestep,
                             _boundary_id,
@@ -459,8 +544,6 @@ def device(index: Any):
                             num_levels,
                             level,
                             f_1_pn,
-                            True,
-                            True,
                         )
                     else:
                         for l in range(self.velocity_set.q):
@@ -471,7 +554,13 @@ def device(index: Any):
             return ll
 
         @neon.Container.factory(name="SFV_collide_coarse")
-        def SFV_collide_coarse(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+        def SFV_collide_coarse(level: int,
+                               f_0_fd: Any,
+                               f_1_fd: Any,
+                               bc_mask_fd: Any,
+                               missing_mask_fd: Any,
+                               omega: Any,
+                               timestep: int):
             """Collision on SFV voxels only — no BCs, no multi-resolution accumulation."""
 
             def ll(loader: neon.Loader):
@@ -487,7 +576,7 @@ def device(index: Any):
                     if _boundary_id != wp.uint8(BC_SFV):
                         return
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                    neon_collide_pipeline(
+                    collide_simple(
                         index,
                         0,
                         _boundary_id,
@@ -499,8 +588,6 @@ def device(index: Any):
                         0,
                         level,
                         f_1_pn,
-                        False,
-                        False,
                     )
 
                 loader.declare_kernel(device)
@@ -532,7 +619,7 @@ def device(index: Any):
                         return
                     if not wp.neon_has_child(f_0_pn, index):
                         _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
-                        neon_collide_pipeline(
+                        collide_bc_accum(
                             index,
                             timestep,
                             _boundary_id,
@@ -544,8 +631,6 @@ def device(index: Any):
                             num_levels,
                             level,
                             f_1_pn,
-                            True,
-                            True,
                         )
                     else:
                         for l in range(self.velocity_set.q):
@@ -556,7 +641,13 @@ def device(index: Any):
             return ll
 
         @neon.Container.factory(name="stream_coarse_step_ABC")
-        def stream_coarse_step_ABC(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+        def stream_coarse_step_ABC(level: int,
+                                   f_0_fd: Any,
+                                   f_1_fd: Any,
+                                   bc_mask_fd: Any,
+                                   missing_mask_fd: Any,
+                                   omega: Any,
+                                   timestep: int):
             def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
@@ -577,7 +668,7 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_explode_coalesce(index, f_0_pn, coalescence_factor_pn)
 
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
@@ -588,7 +679,13 @@ def device(index: Any):
             return ll
 
         @neon.Container.factory(name="SFV_stream_coarse_step_ABC")
-        def SFV_stream_coarse_step_ABC(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
+        def SFV_stream_coarse_step_ABC(level: int,
+                                       f_0_fd: Any,
+                                       f_1_fd: Any,
+                                       bc_mask_fd: Any,
+                                       missing_mask_fd: Any,
+                                       omega: Any,
+                                       timestep: int):
             """Stream on CFV voxels only — skips SFV and solid."""
 
             def ll(loader: neon.Loader):
@@ -613,7 +710,7 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_explode_coalesce(index, f_0_pn, coalescence_factor_pn)
 
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
@@ -693,7 +790,11 @@ def cl_stream_coarse(index: Any):
             return ll_stream_coarse
 
         @neon.Container.factory(name="SFV_stream_coarse_step")
-        def SFV_stream_coarse_step(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any):
+        def SFV_stream_coarse_step(level: int,
+                                   f_0_fd: Any,
+                                   f_1_fd: Any,
+                                   bc_mask_fd: Any,
+                                   missing_mask_fd: Any):
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -742,7 +843,7 @@ def neon_stream_finest_with_explosion(
                 pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                 has_ngh_at_same_level = wp.bool(False)
-                accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                 if not has_ngh_at_same_level:
                     if wp.neon_has_parent(f_0_pn, index):
@@ -795,10 +896,9 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_finest_with_explosion(index, f_0_pn, explosion_src_pn)
 
-                    # Post-streaming boundary conditions
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
 
-                    neon_collide_pipeline(
+                    collide_bc_accum(
                         index,
                         timestep,
                         _boundary_id,
@@ -810,8 +910,6 @@ def device(index: Any):
                         num_levels,
                         level,
                         accumulation_pn,
-                        True,
-                        True,
                     )
 
                 loader.declare_kernel(device)
@@ -861,10 +959,9 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_finest_with_explosion(index, f_0_pn, explosion_src_pn)
 
-                    # Post-streaming boundary conditions
-                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
 
-                    neon_collide_pipeline(
+                    collide_bc_accum(
                         index,
                         timestep,
                         _boundary_id,
@@ -876,8 +973,6 @@ def device(index: Any):
                         num_levels,
                         level,
                         accumulation_pn,
-                        True,
-                        True,
                     )
 
                 loader.declare_kernel(device)
@@ -904,7 +999,7 @@ def device(index: Any):
                         return
                     _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
-                    neon_collide_pipeline(
+                    collide_simple(
                         index,
                         0,
                         _boundary_id,
@@ -916,8 +1011,6 @@ def device(index: Any):
                         0,
                         0,
                         f_1_pn,
-                        False,
-                        False,
                     )
 
                 loader.declare_kernel(device)

From 71e3cf87fb347cc4f3c2b5d3f73baa20a935f50d Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Wed, 11 Mar 2026 16:45:19 -0400
Subject: [PATCH 205/208] Documentation

---
 README.md                                     |  57 ++++--
 .../grid_refinement/flow_past_sphere_3d.py    | 173 ------------------
 ..._3d.py => multires_flow_past_sphere_3d.py} |  33 ++--
 .../ahmed.py => multires_windtunnel_3d.py}    |  19 +-
 examples/cfd/rotating_sphere_3d.py            |  12 +-
 examples/performance/mlups_3d_multires.py     | 115 +++++++-----
 xlb/compute_backend.py                        |  11 +-
 xlb/default_config.py                         |  38 ++++
 xlb/grid/grid.py                              |  63 +++++++
 xlb/grid/multires_grid.py                     |  63 ++++++-
 xlb/grid/neon_grid.py                         |  49 ++++-
 xlb/helper/initializers.py                    |  99 +++++++++-
 xlb/helper/nse_fields.py                      |   8 +
 xlb/helper/simulation_manager.py              |  58 +++++-
 xlb/mres_perf_optimization_type.py            |   7 +
 .../boundary_condition/bc_do_nothing.py       |   5 +-
 .../boundary_condition/bc_equilibrium.py      |  16 +-
 .../bc_extrapolation_outflow.py               |  11 +-
 .../bc_fullway_bounce_back.py                 |   5 +-
 .../bc_halfway_bounce_back.py                 |   7 +-
 xlb/operator/boundary_condition/bc_hybrid.py  |  42 +++++
 .../boundary_condition/bc_regularized.py      |  10 +-
 xlb/operator/boundary_condition/bc_zouhe.py   |  11 +-
 .../boundary_condition/boundary_condition.py  |  33 +++-
 .../boundary_condition/helper_functions_bc.py |  25 +++
 xlb/operator/boundary_masker/aabb.py          |   7 +
 xlb/operator/boundary_masker/aabb_close.py    |  29 ++-
 .../helper_functions_masker.py                |  11 +-
 .../indices_boundary_masker.py                |   7 +
 .../boundary_masker/mesh_boundary_masker.py   |   9 +-
 .../mesh_voxelization_method.py               |  35 +++-
 xlb/operator/boundary_masker/multires_aabb.py |   4 +
 .../boundary_masker/multires_aabb_close.py    |   7 +
 .../multires_indices_boundary_masker.py       |   7 +
 xlb/operator/boundary_masker/multires_ray.py  |   4 +
 xlb/operator/boundary_masker/ray.py           |   9 +-
 xlb/operator/boundary_masker/winding.py       |  10 +-
 xlb/operator/collision/bgk.py                 |  14 +-
 xlb/operator/collision/forced_collision.py    |  19 +-
 xlb/operator/collision/smagorinsky_les_bgk.py |  44 ++---
 .../multires_quadratic_equilibrium.py         |  12 +-
 .../force/multires_momentum_transfer.py       |  24 ++-
 .../macroscopic/multires_macroscopic.py       |  11 +-
 xlb/operator/operator.py                      |  49 +++++
 xlb/operator/stepper/nse_multires_stepper.py  | 164 ++++++++++++-----
 xlb/operator/stepper/nse_stepper.py           |  32 +++-
 xlb/operator/stream/stream.py                 |  17 +-
 xlb/precision_policy.py                       |  17 +-
 xlb/utils/mesher.py                           |  45 +++++
 xlb/utils/utils.py                            |  17 +-
 xlb/velocity_set/velocity_set.py              |   8 +-
 51 files changed, 1205 insertions(+), 377 deletions(-)
 delete mode 100644 examples/cfd/grid_refinement/flow_past_sphere_3d.py
 rename examples/cfd/{grid_refinement/cuboid_flow_past_sphere_3d.py => multires_flow_past_sphere_3d.py} (91%)
 rename examples/cfd/{grid_refinement/ahmed.py => multires_windtunnel_3d.py} (96%)

diff --git a/README.md b/README.md
index 97b2082a..f383fb20 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 # XLB: A Differentiable Massively Parallel Lattice Boltzmann Library in Python for Physics-Based Machine Learning
 
-XLB is a fully differentiable 2D/3D Lattice Boltzmann Method (LBM) library that leverages hardware acceleration. It supports [JAX](https://github.com/google/jax) and [NVIDIA Warp](https://github.com/NVIDIA/warp) backends, and is specifically designed to solve fluid dynamics problems in a computationally efficient and differentiable manner. Its unique combination of features positions it as an exceptionally suitable tool for applications in physics-based machine learning. With the new Warp backend, XLB now offers state-of-the-art performance for even faster simulations.
+XLB is a fully differentiable 2D/3D Lattice Boltzmann Method (LBM) library that leverages hardware acceleration. It supports [JAX](https://github.com/google/jax), [NVIDIA Warp](https://github.com/NVIDIA/warp), and [Neon](https://github.com/Autodesk/Neon) backends, and is specifically designed to solve fluid dynamics problems in a computationally efficient and differentiable manner. Its unique combination of features positions it as an exceptionally suitable tool for applications in physics-based machine learning. With the Warp backend, XLB offers state-of-the-art single-GPU performance, and with the new Neon backend it extends to multi-GPU (single-resolution). More importantly, the Neon backend provides grid refinement capabilities for multi-resolution simulations.
 
 ## Getting Started
 To get started with XLB, you can install it using pip. There are different installation options depending on your hardware and needs:
@@ -63,11 +63,36 @@ If you use XLB in your research, please cite the following paper:
 }
 ```
 
+If you use the grid refinement capabilities in your work, please also cite:
+
+```
+@inproceedings{mahmoud2024optimized,
+  title={Optimized {GPU} implementation of grid refinement in lattice {Boltzmann} method},
+  author={Mahmoud, Ahmed H and Salehipour, Hesam and Meneghin, Massimiliano},
+  booktitle={2024 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
+  pages={398--407},
+  year={2024},
+  organization={IEEE}
+}
+
+@inproceedings{meneghin2022neon,
+  title={Neon: A Multi-{GPU} Programming Model for Grid-based Computations},
+  author={Meneghin, Massimiliano and Mahmoud, Ahmed H. and Jayaraman, Pradeep Kumar and Morris, Nigel J. W.},
+  booktitle={Proceedings of the 36th IEEE International Parallel and Distributed Processing Symposium},
+  pages={817--827},
+  year={2022},
+  month={june},
+  doi={10.1109/IPDPS53621.2022.00084},
+  url={https://escholarship.org/uc/item/9fz7k633}
+}
+```
+
 ## Key Features
-- **Multiple Backend Support:** XLB now includes support for multiple backends including JAX and NVIDIA Warp, providing *state-of-the-art* performance for lattice Boltzmann simulations. Currently, only single GPU is supported for the Warp backend.
+- **Multiple Backend Support:** XLB includes support for JAX, NVIDIA Warp, and Neon backends, providing *state-of-the-art* performance for lattice Boltzmann simulations. The Warp backend targets single-GPU runs, while the Neon backend enables multi-GPU single-resolution and single-GPU multi-resolution simulations.
+- **Multi-Resolution Grid Refinement:** Mesh refinement with nested cuboid grids and multiple kernel-fusion strategies for optimal performance on the Neon backend.
 - **Integration with JAX Ecosystem:** The library can be easily integrated with JAX's robust ecosystem of machine learning libraries such as [Flax](https://github.com/google/flax), [Haiku](https://github.com/deepmind/dm-haiku), [Optax](https://github.com/deepmind/optax), and many more.
 - **Differentiable LBM Kernels:** XLB provides differentiable LBM kernels that can be used in differentiable physics and deep learning applications.
-- **Scalability:** XLB is capable of scaling on distributed multi-GPU systems using the JAX backend, enabling the execution of large-scale simulations on hundreds of GPUs with billions of cells.
+- **Scalability:** XLB is capable of scaling on distributed multi-GPU systems using the JAX backend or the Neon backend, enabling the execution of large-scale simulations on hundreds of GPUs with billions of cells.
 - **Support for Various LBM Boundary Conditions and Kernels:** XLB supports several LBM boundary conditions and collision kernels.
 - **User-Friendly Interface:** Written entirely in Python, XLB emphasizes a highly accessible interface that allows users to extend the library with ease and quickly set up and run new simulations.
 - **Leverages JAX Array and Shardmap:** The library incorporates the new JAX array unified array type and JAX shardmap, providing users with a numpy-like interface. This allows users to focus solely on the semantics, leaving performance optimizations to the compiler.
@@ -103,7 +128,7 @@ If you use XLB in your research, please cite the following paper:
   <img src="https://raw.githubusercontent.com/autodesk/xlb/main/assets/building.png" alt="" width="700">
 </p>
 <p align="center">
-  Airflow in to, out of, and within a building (~400 million cells)
+  Airflow into, out of, and within a building (~400 million cells)
 </p>
 
 <p align="center">
@@ -128,6 +153,7 @@ The stages of a fluid density field from an initial state to the emergence of th
 
 - BGK collision model (Standard LBM collision model)
 - KBC collision model (unconditionally stable for flows with high Reynolds number)
+- Smagorinsky LES sub-grid model for turbulence modelling
 
 ### Machine Learning
 
@@ -143,21 +169,25 @@ The stages of a fluid density field from an initial state to the emergence of th
 
 ### Compute Capabilities
 - Single GPU support for the Warp backend with state-of-the-art performance
+- Multi-GPU support using the Neon backend with single-resolution grids
+- Grid refinement support on single-GPU using the Neon backend
 - Distributed Multi-GPU support using the JAX backend
 - Mixed-Precision support (store vs compute)
+- Multiple kernel-fusion performance strategies for multi-resolution simulations
 - Out-of-core support (coming soon)
 
 ### Output
 
 - Binary and ASCII VTK output (based on PyVista library)
+- HDF5/XDMF output for multi-resolution data (with gzip compression)
 - In-situ rendering using [PhantomGaze](https://github.com/loliverhennigh/PhantomGaze) library
 - [Orbax](https://github.com/google/orbax)-based distributed asynchronous checkpointing
-- Image Output
+- Image Output (including multi-resolution slice images)
 - 3D mesh voxelizer using trimesh
 
 ### Boundary conditions
 
-- **Equilibrium BC:** In this boundary condition, the fluid populations are assumed to be in at equilibrium. Can be used to set prescribed velocity or pressure.
+- **Equilibrium BC:** In this boundary condition, the fluid populations are assumed to be at equilibrium. Can be used to set prescribed velocity or pressure.
 
 - **Full-Way Bounceback BC:** In this boundary condition, the velocity of the fluid populations is reflected back to the fluid side of the boundary, resulting in zero fluid velocity at the boundary.
 
@@ -171,17 +201,22 @@ The stages of a fluid density field from an initial state to the emergence of th
 
 - **Interpolated Bounceback BC:** Interpolated bounce-back boundary condition for representing curved boundaries.
 
+- **Hybrid BC:** Combines regularized and bounce-back methods with optional wall-distance interpolation for improved accuracy on curved geometries.
+
+- **Grad's Approximation BC:** Boundary condition based on Grad's approximation of the non-equilibrium distribution.
+
 ## Roadmap
 
-### Work in Progress (WIP)
-*Note: Some of the work-in-progress features can be found in the branches of the XLB repository. For contributions to these features, please reach out.*
+### Recently Completed
 
- - 🌐 **Grid Refinement:** Implementing adaptive mesh refinement techniques for enhanced simulation accuracy.
+ - ✅ **Grid Refinement:** Multi-resolution LBM with nested cuboid grids and multiple kernel-fusion strategies via the Neon backend.
 
- - 💾 **Out-of-Core Computations:** Enabling simulations that exceed available GPU memory, suitable for CPU+GPU coherent memory models such as NVIDIA's Grace Superchips (coming soon).
+ - ✅ **Multi-GPU Acceleration using [Neon](https://github.com/Autodesk/Neon) + Warp:** Multi-GPU support through Neon's data structures with Warp-based kernels for single-resolution settings.
 
+### Work in Progress (WIP)
+*Note: Some of the work-in-progress features can be found in the branches of the XLB repository. For contributions to these features, please reach out.*
 
-- ⚡ **Multi-GPU Acceleration using [Neon](https://github.com/Autodesk/Neon) + Warp:** Using Neon's data structure for improved scaling.
+ - 💾 **Out-of-Core Computations:** Enabling simulations that exceed available GPU memory, suitable for CPU+GPU coherent memory models such as NVIDIA's Grace Superchips (coming soon).
 
 - 🗜️ **GPU Accelerated Lossless Compression and Decompression**: Implementing high-performance lossless compression and decompression techniques for larger-scale simulations and improved performance.
 
diff --git a/examples/cfd/grid_refinement/flow_past_sphere_3d.py b/examples/cfd/grid_refinement/flow_past_sphere_3d.py
deleted file mode 100644
index 30738351..00000000
--- a/examples/cfd/grid_refinement/flow_past_sphere_3d.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import xlb
-from xlb.compute_backend import ComputeBackend
-from xlb.precision_policy import PrecisionPolicy
-from xlb.grid import multires_grid_factory
-from xlb.operator.boundary_condition import FullwayBounceBackBC, HalfwayBounceBackBC, RegularizedBC, ExtrapolationOutflowBC, DoNothingBC, ZouHeBC
-import neon
-import warp as wp
-import numpy as np
-import time
-
-# -------------------------- Simulation Setup --------------------------
-
-Re = 500.0
-grid_shape = (512 // 2, 128 // 2, 128 // 2)
-compute_backend = ComputeBackend.NEON
-precision_policy = PrecisionPolicy.FP32FP32
-velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, compute_backend=compute_backend)
-u_max = 0.04
-num_steps = 10000
-post_process_interval = 1000
-
-# Initialize XLB
-xlb.init(
-    velocity_set=velocity_set,
-    default_backend=compute_backend,
-    default_precision_policy=precision_policy,
-)
-
-# Create the multires grid
-nx, ny, nz = grid_shape
-sphere_origin = (nx // 6, ny // 2, nz // 2)
-sphere_radius = min(nx, ny, nz) // 12  # Radius of the sphere
-num_levels = 3
-level_origins = []
-level_list = []
-for lvl in range(num_levels):
-    divider = 2**lvl
-    growth = 1.25**lvl
-    shape = nx // divider, ny // divider, nz // divider
-    if lvl == num_levels - 1:
-        level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
-        box_origin = np.array([0, 0, 0])  # The coarsest level has no origin offset
-    else:
-        box_size = tuple([int(0.3 * shape[i] * growth) for i in range(3)])
-        level = np.ascontiguousarray(np.ones(box_size, dtype=int), dtype=np.int32)
-        if lvl == 0:
-            box_origin = tuple(
-                [sphere_origin[0] // divider - int(2 * growth * sphere_radius // divider)] + [shape[i] // 2 - box_size[i] // 2 for i in range(1, 3)]
-            )
-        else:
-            finer_box_size = level_list[-1].shape
-            finer_box_origin = np.array(level_origins[-1])
-            shift = np.array(box_size) - np.array(finer_box_size) // 2
-            box_origin = finer_box_origin // 2 - shift // 2
-    level_list.append(level)
-    level_origins.append(box_origin)
-
-
-# Note that this exporter does not produce expected results at the moment because the level_list
-# produced above include dense fields and are not sparse.
-
-# # Define exporter object for hdf5 output
-# from xlb.utils import MultiresIO
-
-# # Pack the needed information for the exporter in a list called "level_data"
-# level_data = []
-# for level in range(num_levels):
-#     voxel_size = 2**level
-#     level_data.append(
-#         [level_list[level].astype(bool), voxel_size, level_origins[level], level],
-#     )
-# exporter = MultiresIO({"velocity": 3, "density": 1}, level_data)
-
-# Create the multires grid
-grid = multires_grid_factory(
-    grid_shape,
-    velocity_set=velocity_set,
-    sparsity_pattern_list=level_list,
-    sparsity_pattern_origins=[neon.Index_3d(*origin) for origin in level_origins],
-)
-
-# Define Boundary Indices
-coarsest_level = grid.count_levels - 1
-box = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level))
-box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(coarsest_level), remove_edges=True)
-inlet = box_no_edge["left"]
-outlet = box_no_edge["right"]
-walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(velocity_set.d)]
-walls = np.unique(np.array(walls), axis=-1).tolist()
-
-# sphere at the finest level
-x = np.arange(nx)
-y = np.arange(ny)
-z = np.arange(nz)
-X, Y, Z = np.meshgrid(x, y, z, indexing="ij")
-indices = np.where((X - sphere_origin[0]) ** 2 + (Y - sphere_origin[1]) ** 2 + (Z - sphere_origin[2]) ** 2 < sphere_radius**2)
-sphere = [tuple(indices[i]) for i in range(velocity_set.d)]
-
-# Convert bc indices to a list of list (first entry corresponds to the finest level)
-inlet = [[] for _ in range(num_levels - 1)] + [inlet]
-outlet = [[] for _ in range(num_levels - 1)] + [outlet]
-walls = [[] for _ in range(num_levels - 1)] + [walls]
-sphere = [sphere] + [[] for _ in range(num_levels - 1)]
-
-
-# Define Boundary Conditions
-def bc_profile():
-    assert compute_backend == ComputeBackend.NEON
-
-    # Note nx, ny, nz are the dimensions of the grid at the finest level while the inlet is defined at the coarsest level
-    _, ny, nz = grid_shape
-    dtype = precision_policy.compute_precision.wp_dtype
-    H_y = dtype(ny // 2 ** (num_levels - 1) - 1)  # Height in y direction
-    H_z = dtype(nz // 2 ** (num_levels - 1) - 1)  # Height in z direction
-    two = dtype(2.0)
-    u_max_wp = dtype(u_max)
-
-    @wp.func
-    def bc_profile_warp(index: wp.vec3i):
-        # Poiseuille flow profile: parabolic velocity distribution
-        y = dtype(index[1])
-        z = dtype(index[2])
-
-        # Calculate normalized distance from center
-        y_center = y - (H_y / two)
-        z_center = z - (H_z / two)
-        r_squared = (two * y_center / H_y) ** two + (two * z_center / H_z) ** two
-
-        # Parabolic profile: u = u_max * (1 - r²)
-        return wp.vec(u_max_wp * wp.max(dtype(0.0), dtype(1.0) - r_squared), length=1)
-
-    return bc_profile_warp
-
-
-# Initialize Boundary Conditions
-bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
-# Alternatively, use a prescribed velocity profile
-# bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
-bc_walls = FullwayBounceBackBC(indices=walls)
-# bc_outlet = ExtrapolationOutflowBC(indices=outlet)
-bc_outlet = DoNothingBC(indices=outlet)
-bc_sphere = HalfwayBounceBackBC(indices=sphere)
-boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
-
-
-# Configure the simulation relaxation time
-visc = 2.0 * u_max * sphere_radius / Re
-omega_finest = 1.0 / (3.0 * visc + 0.5)
-
-# Define a multi-resolution simulation manager
-sim = xlb.helper.MultiresSimulationManager(
-    omega_finest=omega_finest,
-    grid=grid,
-    boundary_conditions=boundary_conditions,
-    collision_type="BGK",
-)
-
-# -------------------------- Simulation Loop --------------------------
-
-wp.synchronize()
-start_time = time.time()
-for step in range(num_steps):
-    sim.step()
-
-    if step % post_process_interval == 0 or step == num_steps - 1:
-        # TODO: Issues in the vtk output for rectangular cuboids (as if a cuboid grid with the largest side is assumed)
-        wp.synchronize()
-        sim.export_macroscopic("multires_flow_over_sphere_3d_")
-        wp.synchronize()
-        end_time = time.time()
-        elapsed = end_time - start_time
-        print(f"Completed step {step}. Time elapsed for {post_process_interval} steps: {elapsed:.6f} seconds.")
-        start_time = time.time()
diff --git a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py b/examples/cfd/multires_flow_past_sphere_3d.py
similarity index 91%
rename from examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
rename to examples/cfd/multires_flow_past_sphere_3d.py
index baaa72e3..ea510a29 100644
--- a/examples/cfd/grid_refinement/cuboid_flow_past_sphere_3d.py
+++ b/examples/cfd/multires_flow_past_sphere_3d.py
@@ -1,3 +1,12 @@
+"""
+3D flow past a sphere with multi-resolution LBM.
+
+Demonstrates the multi-resolution Neon backend for a 3-D Poiseuille-
+inlet flow past a sphere inside a nested cuboid multi-resolution domain.
+Uses AABB voxelization with a hybrid (non-equilibrium regularized, mesh-distance
+aware) boundary condition on the sphere and computes lift/drag via momentum transfer.
+"""
+
 import neon
 import warp as wp
 import numpy as np
@@ -32,10 +41,10 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
     # First entry should be full domain size
     # Domain multipliers
     domainMultiplier = [
-        [15, 15, 7, 7, 7, 7],  # -x, x, -y, y, -z, z
-        [6, 8, 5, 5, 5, 5],  # -x, x, -y, y, -z, z
-        [4, 6, 4, 4, 4, 4],
-        [2, 4, 2, 2, 2, 2],
+        [7, 22, 7, 7, 7, 7],  # -x, x, -y, y, -z, z  (sphere at 1/4 domain from inlet)
+        [3, 12, 5, 5, 5, 5],  # -x, x, -y, y, -z, z  (wake-biased)
+        [2, 8, 4, 4, 4, 4],
+        [1, 5, 2, 2, 2, 2],
         # [1, 2, 1, 1, 1, 1],
         # [0.4, 1, 0.4, 0.4, 0.4, 0.4],
         # [0.2, 0.4, 0.2, 0.2, 0.2, 0.2],
@@ -105,7 +114,7 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
 )
 
 # Generate the cuboid mesh and sphere vertices
-stl_filename = "examples/cfd/stl-files/sphere.stl"
+stl_filename = "../stl-files/sphere.stl"
 level_data, sphere, grid_shape_finest = generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part)
 
 
@@ -141,6 +150,7 @@ def generate_cuboid_mesh(stl_filename, num_finest_voxels_across_part):
 
 # Define Boundary Conditions
 def bc_profile():
+    """Build a Warp function for a Poiseuille parabolic inlet velocity profile."""
     assert compute_backend == ComputeBackend.NEON
     # IMPORTANT NOTE: the user defined functional must be defined in terms of the indices at the finest level
     _, ny, nz = grid_shape_finest
@@ -183,17 +193,16 @@ def bc_profile_warp(index: wp.vec3i):
 
 # Initialize Boundary Conditions
 bc_left = RegularizedBC("velocity", profile=bc_profile(), indices=inlet)
+# Alternatives:
 # bc_left = HybridBC(bc_method="bounceback_regularized", profile=bc_profile(), indices=inlet)
-# Alternatively, use a prescribed velocity profile
 # bc_left = RegularizedBC("velocity", prescribed_value=(u_max, 0.0, 0.0), indices=inlet)
 bc_walls = FullwayBounceBackBC(indices=walls)
-# bc_ground = FullwayBounceBackBC(indices=grid.boundary_indices_across_levels(level_data, box_side="front"))
-# bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_outlet = DoNothingBC(indices=outlet)
-# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod('AABB'))
+# bc_outlet = ExtrapolationOutflowBC(indices=outlet)
 bc_sphere = HybridBC(
     bc_method="nonequilibrium_regularized", mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod("AABB"), use_mesh_distance=True
 )
+# bc_sphere = HalfwayBounceBackBC(mesh_vertices=sphere, voxelization_method=MeshVoxelizationMethod('AABB'))
 
 boundary_conditions = [bc_walls, bc_left, bc_outlet, bc_sphere]
 
@@ -228,7 +237,7 @@ def bc_profile_warp(index: wp.vec3i):
 
 
 def print_lift_drag(sim):
-    # Compute lift and drag
+    """Compute and print drag and lift coefficients from the simulation state."""
     boundary_force = momentum_transfer(sim.f_0, sim.f_1, sim.bc_mask, sim.missing_mask)
     drag = boundary_force[0]  # x-direction
     lift = boundary_force[2]
@@ -259,7 +268,7 @@ def print_lift_drag(sim):
 
         # Call the exporter to save the current state
         nx, ny, nz = grid_shape_finest
-        filename = f"multires_flow_over_sphere_3d_{step:04d}"
+        filename = f"multires_flow_past_sphere_3d_{step:04d}"
         wp.synchronize()
         exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=2)
         exporter.to_slice_image(
@@ -269,7 +278,7 @@ def print_lift_drag(sim):
             plane_normal=(0, 0, 1),
             grid_res=256,
             slice_thickness=2 ** (num_levels - 1),
-            bounds=(0.4, 0.6, 0.4, 0.6),
+            bounds=(0.1, 0.6, 0.3, 0.7),
         )
 
         # Print lift and drag coefficients
diff --git a/examples/cfd/grid_refinement/ahmed.py b/examples/cfd/multires_windtunnel_3d.py
similarity index 96%
rename from examples/cfd/grid_refinement/ahmed.py
rename to examples/cfd/multires_windtunnel_3d.py
index 59a633fc..ad02a3c5 100644
--- a/examples/cfd/grid_refinement/ahmed.py
+++ b/examples/cfd/multires_windtunnel_3d.py
@@ -1,3 +1,12 @@
+"""
+Ahmed body aerodynamics with multi-resolution LBM.
+
+Simulates turbulent flow around the Ahmed body (25-degree slant angle)
+using the XLB multi-resolution Neon backend.  Computes drag and lift
+coefficients via momentum transfer and exports HDF5/XDMF data for
+post-processing.
+"""
+
 import neon
 import warp as wp
 import numpy as np
@@ -35,7 +44,7 @@
 voxel_size = 0.005  # Finest voxel size in meters
 
 # STL filename
-stl_filename = "examples/cfd/stl-files/Ahmed_25_NoLegs.stl"
+stl_filename = "../stl-files/Ahmed_25_NoLegs.stl"
 script_name = "Ahmed"
 
 # I/O settings
@@ -165,7 +174,7 @@ def initialize_simulation(
 
 # Utility Functions
 # =================
-def print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area):
+def compute_force_coefficients(sim, step, momentum_transfer, wind_speed_lbm, reference_area):
     """
     Calculate and print lift and drag coefficients.
     """
@@ -181,7 +190,7 @@ def print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area
     return cd, cl, drag
 
 
-def plot_drag_lift(drag_values, output_dir, print_interval, script_name, percentile_range=(15, 85), use_log_scale=False):
+def plot_force_coefficients(drag_values, output_dir, print_interval, script_name, percentile_range=(15, 85), use_log_scale=False):
     """
     Plot CD and CL over time and save the plot to the output directory.
     """
@@ -495,7 +504,7 @@ def _load_sim_line(csv_path):
     if step % print_interval == 0 or step == num_steps - 1:
         sim.macro(sim.f_0, sim.bc_mask, sim.rho, sim.u, streamId=0)
         wp.synchronize()
-        cd, cl, drag = print_lift_drag(sim, step, momentum_transfer, wind_speed_lbm, reference_area)
+        cd, cl, drag = compute_force_coefficients(sim, step, momentum_transfer, wind_speed_lbm, reference_area)
         filename = os.path.join(output_dir, f"{script_name}_{step:04d}")
         h5exporter.to_hdf5(filename, {"velocity": sim.u, "density": sim.rho}, compression="gzip", compression_opts=0)
         h5exporter.to_slice_image(
@@ -548,7 +557,7 @@ def _load_sim_line(csv_path):
         fd.write("Step,Cd,Cl\n")
         for i, (cd, cl) in enumerate(drag_values):
             fd.write(f"{i * print_interval},{cd},{cl}\n")
-    plot_drag_lift(drag_values, output_dir, print_interval, script_name)
+    plot_force_coefficients(drag_values, output_dir, print_interval, script_name)
 
 # Calculate and print average Cd and Cl for the last 50%
 drag_values_array = np.array(drag_values)
diff --git a/examples/cfd/rotating_sphere_3d.py b/examples/cfd/rotating_sphere_3d.py
index 235345b3..7a0b10a4 100644
--- a/examples/cfd/rotating_sphere_3d.py
+++ b/examples/cfd/rotating_sphere_3d.py
@@ -1,3 +1,11 @@
+"""
+Rotating sphere 3-D example (single-resolution).
+
+Simulates flow past a sphere rotating about the y-axis using the
+halfway bounce-back BC with a prescribed rotational-velocity profile.
+Computes drag and lift coefficients over time and saves VTK snapshots.
+"""
+
 import xlb
 import trimesh
 import time
@@ -86,7 +94,7 @@
 walls = np.unique(np.array(walls), axis=-1).tolist()
 
 # Load the mesh (replace with your own mesh)
-stl_filename = "examples/cfd/stl-files/sphere.stl"
+stl_filename = "../stl-files/sphere.stl"
 mesh = trimesh.load_mesh(stl_filename, process=False)
 mesh_vertices = mesh.vertices
 
@@ -105,6 +113,7 @@
 
 # Define rotating boundary profile
 def bc_profile():
+    """Build a Warp function returning the rotational wall velocity at a voxel."""
     dtype = precision_policy.compute_precision.wp_dtype
     _u_vec = wp.vec(velocity_set.d, dtype=dtype)
     angular_velocity = _u_vec(0.0, rot_rate, 0.0)
@@ -210,6 +219,7 @@ def post_process(
     lift_coefficients,
     time_steps,
 ):
+    # Compute macroscopic fields, force coefficients, and save VTK output.
     """
     Post-process simulation data: save fields, compute forces, and plot drag coefficient.
 
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index 3a143764..ba71985b 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -1,3 +1,19 @@
+"""
+MLUPS benchmark for the multi-resolution LBM solver.
+
+Runs a lid-driven cavity simulation on a multi-resolution Neon grid and
+reports the Equivalent Million Lattice Updates Per Second (EMLUPS).
+
+Usage::
+
+    python mlups_3d_multires.py <cube_edge> <num_steps> neon <precision> \\
+           <num_levels> <mres_perf_opt> [options]
+
+Example::
+
+    python mlups_3d_multires.py 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM
+"""
+
 import xlb
 import argparse
 import time
@@ -14,6 +30,7 @@
 
 
 def parse_arguments():
+    """Parse and validate command-line arguments."""
     parser = argparse.ArgumentParser(
         description="MLUPS for 3D Lattice Boltzmann Method Simulation with Multi-resolution Grid",
         epilog="""
@@ -82,6 +99,7 @@ def parse_arguments():
 
 
 def print_args(args):
+    """Print the simulation configuration to stdout."""
     # Print simulation configuration
     print("=" * 60)
     print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
@@ -104,6 +122,13 @@ def print_args(args):
 
 
 def setup_simulation(args):
+    """Initialize XLB globals (velocity set, backend, precision) from CLI args.
+
+    Returns
+    -------
+    VelocitySet
+        The configured lattice velocity set.
+    """
     compute_backend = None
     if args.compute_backend == "neon":
         compute_backend = ComputeBackend.NEON
@@ -137,7 +162,28 @@ def setup_simulation(args):
     return velocity_set
 
 
-def problem1(grid_shape, velocity_set, num_levels):
+def ldc_multires_setup(grid_shape, velocity_set, num_levels):
+    """Lid-driven cavity with refinement peeling inward from the boundary.
+
+    Each finer level covers only the outermost shell of its parent,
+    concentrating resolution near the walls.
+
+    Parameters
+    ----------
+    grid_shape : tuple of int
+        Domain size at the finest level.
+    velocity_set : VelocitySet
+        Lattice velocity set.
+    num_levels : int
+        Number of refinement levels.
+
+    Returns
+    -------
+    grid : NeonMultiresGrid
+    lid : list of index arrays (per level)
+    walls : list of index arrays (per level)
+    """
+
     def peel(dim, idx, peel_level, outwards):
         if outwards:
             xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
@@ -204,43 +250,6 @@ def get_levels(num_levels):
     return grid, lid, walls
 
 
-def problem2(grid_shape, velocity_set, num_levels):
-    # Example 2: Coarsest at the edges (2 level only)
-    level_origins = []
-    level_list = []
-    for lvl in range(num_levels):
-        divider = 2**lvl
-        growth = 1.5**lvl
-        shape = grid_shape[0] // divider, grid_shape[1] // divider, grid_shape[2] // divider
-        if lvl == num_levels - 1:
-            level = np.ascontiguousarray(np.ones(shape, dtype=int), dtype=np.int32)
-            box_origin = (0, 0, 0)  # The coarsest level has no origin offset
-        else:
-            box_size = tuple([int(shape[i] // 4 * growth) for i in range(3)])
-            box_origin = tuple([shape[i] // 2 - box_size[i] // 2 for i in range(3)])
-            level = np.ascontiguousarray(np.ones(box_size, dtype=int), dtype=np.int32)
-        level_list.append(level)
-        level_origins.append(neon.Index_3d(*box_origin))
-
-    # Create the multires grid
-    grid = multires_grid_factory(
-        grid_shape,
-        velocity_set=velocity_set,
-        sparsity_pattern_list=level_list,
-        sparsity_pattern_origins=level_origins,
-    )
-
-    box = grid.bounding_box_indices(shape=grid.level_to_shape(num_levels - 1))
-    box_no_edge = grid.bounding_box_indices(shape=grid.level_to_shape(1), remove_edges=True)
-    lid = box_no_edge["top"]
-    walls = [box["bottom"][i] + box["left"][i] + box["right"][i] + box["front"][i] + box["back"][i] for i in range(len(grid.shape))]
-    walls = np.unique(np.array(walls), axis=-1).tolist()
-    # convert bc indices to a list of list, where the first entry of the list corresponds to the finest level
-    lid = [[] for _ in range(num_levels - 1)] + [lid]
-    walls = [[] for _ in range(num_levels - 1)] + [walls]
-    return grid, lid, walls
-
-
 def run(
     velocity_set,
     grid_shape,
@@ -250,19 +259,15 @@ def run(
     export_final_velocity,
     mres_perf_opt,
 ):
-    # Create grid and setup boundary conditions
-
-    # Convert indices to list of indices per level
-    # TODO: overlaps emerge if bc indices are orignally specified at the finest grid and they exist at the coarser levels
-    # levels_mask = [lvl.astype(bool) for lvl in levels]
-    # lid = construct_indices_per_level(grid_shape, lid, levels_mask, level_origins)
-    # walls = construct_indices_per_level(grid_shape, walls, levels_mask, level_origins)
-
-    # Example 1: fine to coarse
-    # grid, lid, walls = problem1(grid_shape, velocity_set, num_levels)
+    """Set up and execute the benchmark simulation.
 
-    # Example 2: Coarse to fine:
-    grid, lid, walls = problem1(grid_shape, velocity_set, num_levels)
+    Returns
+    -------
+    dict
+        ``{"time": elapsed_seconds, "num_levels": int}``
+    """
+    # Create grid and setup boundary conditions
+    grid, lid, walls = ldc_multires_setup(grid_shape, velocity_set, num_levels)
 
     prescribed_vel = 0.1
     boundary_conditions = [
@@ -313,6 +318,16 @@ def run(
 
 
 def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
+    """Compute the Equivalent Million Lattice Updates Per Second (EMLUPS).
+
+    The metric accounts for the fact that the finest level is stepped
+    2^(num_levels-1) times per coarsest-level step.
+
+    Returns
+    -------
+    dict
+        ``{"EMLUPS": float, "finer_steps": int}``
+    """
     num_step_finer = num_steps * 2 ** (num_levels - 1)
     total_lattice_updates = cube_edge**3 * num_step_finer
     mlups = (total_lattice_updates / elapsed_time) / 1e6
diff --git a/xlb/compute_backend.py b/xlb/compute_backend.py
index 6b4ed702..d53ff8a4 100644
--- a/xlb/compute_backend.py
+++ b/xlb/compute_backend.py
@@ -1,9 +1,18 @@
-# Enum used to keep track of the compute backends
+"""
+Compute-backend enumeration for XLB.
+"""
 
 from enum import Enum, auto
 
 
 class ComputeBackend(Enum):
+    """Available compute backends.
+
+    ``JAX``  — single-res, multi-GPU/TPU via JAX.
+    ``WARP`` — single-res, single-GPU CUDA via NVIDIA Warp.
+    ``NEON`` — single-res and multi-res, single-GPU CUDA via Neon (uses Warp kernels internally).
+    """
+
     JAX = auto()
     WARP = auto()
     NEON = auto()
diff --git a/xlb/default_config.py b/xlb/default_config.py
index f709f0d5..078128b8 100644
--- a/xlb/default_config.py
+++ b/xlb/default_config.py
@@ -1,3 +1,11 @@
+"""
+Global configuration for XLB.
+
+Call :func:`init` once at the start of every script to select the velocity
+set, compute backend, and precision policy.  All operators read their
+defaults from :class:`DefaultConfig` when explicit arguments are omitted.
+"""
+
 from xlb.compute_backend import ComputeBackend
 from dataclasses import dataclass
 from xlb.precision_policy import PrecisionPolicy
@@ -5,12 +13,40 @@
 
 @dataclass
 class DefaultConfig:
+    """Singleton holding the active global configuration.
+
+    Attributes are set by :func:`init` and read by operators, grids, and
+    helpers throughout XLB.
+
+    Attributes
+    ----------
+    default_precision_policy : PrecisionPolicy or None
+        Active precision policy (compute / store dtype pair).
+    velocity_set : VelocitySet or None
+        Active lattice velocity set.
+    default_backend : ComputeBackend or None
+        Active compute backend.
+    """
+
     default_precision_policy = None
     velocity_set = None
     default_backend = None
 
 
 def init(velocity_set, default_backend, default_precision_policy):
+    """Initialize the global XLB configuration.
+
+    Must be called before creating any grid, operator, or field.
+
+    Parameters
+    ----------
+    velocity_set : VelocitySet
+        Lattice velocity set (e.g. ``D3Q19``).
+    default_backend : ComputeBackend
+        Compute backend to use (JAX, WARP, or NEON).
+    default_precision_policy : PrecisionPolicy
+        Precision policy for compute and storage dtypes.
+    """
     DefaultConfig.velocity_set = velocity_set
     DefaultConfig.default_backend = default_backend
     DefaultConfig.default_precision_policy = default_precision_policy
@@ -43,10 +79,12 @@ def init(velocity_set, default_backend, default_precision_policy):
 
 
 def default_backend() -> ComputeBackend:
+    """Return the currently configured compute backend."""
     return DefaultConfig.default_backend
 
 
 def check_backend_support():
+    """Print a summary of available JAX hardware accelerators."""
     import jax
 
     if jax.devices()[0].platform == "gpu":
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index 2d03a33a..90aeb2c6 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -1,3 +1,13 @@
+"""
+Grid abstraction and factory functions for XLB.
+
+Defines the :class:`Grid` abstract base class that every backend-specific
+grid must implement, plus two factory helpers:
+
+* :func:`grid_factory` — creates a single-resolution grid for any backend.
+* :func:`multires_grid_factory` — creates a multi-resolution grid (Neon only).
+"""
+
 from abc import ABC, abstractmethod
 from typing import Tuple, List
 import numpy as np
@@ -13,6 +23,24 @@ def grid_factory(
     velocity_set=None,
     backend_config=None,
 ):
+    """Create a single-resolution grid for the specified backend.
+
+    Parameters
+    ----------
+    shape : tuple of int
+        Domain dimensions, e.g. ``(nx, ny, nz)``.
+    compute_backend : ComputeBackend, optional
+        Backend to use.  Defaults to ``DefaultConfig.default_backend``.
+    velocity_set : VelocitySet, optional
+        Required for the Neon backend.
+    backend_config : dict, optional
+        Backend-specific configuration (Neon only).
+
+    Returns
+    -------
+    Grid
+        A backend-specific grid instance.
+    """
     compute_backend = compute_backend or DefaultConfig.default_backend
     velocity_set = velocity_set or DefaultConfig.velocity_set
     if compute_backend == ComputeBackend.WARP:
@@ -38,6 +66,26 @@ def multires_grid_factory(
     sparsity_pattern_list: List[np.ndarray] = [],
     sparsity_pattern_origins: List[neon.Index_3d] = [],
 ):
+    """Create a multi-resolution grid (Neon backend only).
+
+    Parameters
+    ----------
+    shape : tuple of int
+        Bounding-box dimensions at the finest level.
+    compute_backend : ComputeBackend, optional
+        Must be ``ComputeBackend.NEON``.
+    velocity_set : VelocitySet, optional
+        Lattice velocity set.
+    sparsity_pattern_list : list of np.ndarray
+        Active-voxel masks, one per level (finest first).
+    sparsity_pattern_origins : list of neon.Index_3d
+        Origin of each level's pattern in finest-level coordinates.
+
+    Returns
+    -------
+    NeonMultiresGrid
+        A multi-resolution Neon grid.
+    """
     compute_backend = compute_backend or DefaultConfig.default_backend
     velocity_set = velocity_set or DefaultConfig.velocity_set
     if compute_backend == ComputeBackend.NEON:
@@ -51,6 +99,20 @@ def multires_grid_factory(
 
 
 class Grid(ABC):
+    """Abstract base class for all XLB computational grids.
+
+    Subclasses must implement :meth:`_initialize_backend` to set up the
+    backend-specific data structures and :meth:`create_field` (not
+    enforced by ABC but expected by all operators).
+
+    Parameters
+    ----------
+    shape : tuple of int
+        Domain dimensions.
+    compute_backend : ComputeBackend
+        The compute backend this grid is associated with.
+    """
+
     def __init__(
         self,
         shape: Tuple[int, ...],
@@ -66,6 +128,7 @@ def _initialize_backend(self):
         pass
 
     def get_compute_backend(self):
+        """Return the compute backend associated with this grid."""
         return self.compute_backend
 
     def bounding_box_indices(self, shape=None, remove_edges=False):
diff --git a/xlb/grid/multires_grid.py b/xlb/grid/multires_grid.py
index e8929e66..582dd27e 100644
--- a/xlb/grid/multires_grid.py
+++ b/xlb/grid/multires_grid.py
@@ -1,3 +1,12 @@
+"""
+Multi-resolution sparse grid backed by the Neon ``mGrid`` runtime.
+
+This module wraps ``neon.multires.mGrid`` and exposes it through the
+:class:`Grid` interface.  The grid is hierarchical: level 0 is the finest
+and level *N-1* is the coarsest.  Each coarser level has half the
+resolution of the level below it (refinement factor 2).
+"""
+
 import numpy as np
 import warp as wp
 import neon
@@ -9,6 +18,26 @@
 
 
 class NeonMultiresGrid(Grid):
+    """Hierarchical multi-resolution grid on the Neon backend.
+
+    Wraps ``neon.multires.mGrid``.  Each level is described by a boolean
+    sparsity pattern (active-voxel mask) and an integer origin that
+    places it within the finest-level coordinate system.
+
+    Parameters
+    ----------
+    shape : tuple of int
+        Bounding-box dimensions at the **finest** level ``(nx, ny, nz)``.
+    velocity_set : VelocitySet
+        Lattice velocity set defining neighbour connectivity.
+    sparsity_pattern_list : list of np.ndarray
+        One boolean/int array per level indicating which voxels are active.
+        Index 0 = finest level, index *N-1* = coarsest.
+    sparsity_pattern_origins : list of neon.Index_3d
+        Origin offset for each level's pattern in the finest-level
+        coordinate system.
+    """
+
     def __init__(
         self,
         shape,
@@ -54,12 +83,6 @@ def _initialize_backend(self):
 
         self.bk = neon.Backend(runtime=neon.Backend.Runtime.stream, dev_idx_list=dev_idx_list)
 
-        """
-         backend: neon.Backend,
-         dim,
-         sparsity_pattern_list: List[np.ndarray],
-         sparsity_pattern_origins: List[neon.Index_3d],
-         stencil: List[List[int]]):"""
         self.grid = neon.multires.mGrid(
             backend=self.bk,
             dim=self.dim,
@@ -78,6 +101,28 @@ def create_field(
         fill_value=None,
         neon_memory_type: neon.MemoryType = neon.MemoryType.host_device(),
     ):
+        """Allocate a new multi-resolution Neon field.
+
+        The field spans all grid levels.  Each level is either zero-filled
+        or filled with *fill_value*.
+
+        Parameters
+        ----------
+        cardinality : int
+            Number of components per voxel.
+        dtype : Precision, optional
+            Element precision.  Defaults to the store precision from the
+            global config.
+        fill_value : float, optional
+            Value to fill every element with.  ``None`` means zero.
+        neon_memory_type : neon.MemoryType
+            Memory residency (host, device, or both).
+
+        Returns
+        -------
+        neon.multires.mField
+            The newly allocated multi-resolution field.
+        """
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
         field = self.grid.new_field(
             cardinality=cardinality,
@@ -92,9 +137,15 @@ def create_field(
         return field
 
     def get_neon_backend(self):
+        """Return the underlying ``neon.Backend`` instance."""
         return self.bk
 
     def level_to_shape(self, level):
+        """Return the bounding-box shape at the given grid level.
+
+        Level 0 is the finest and has shape ``self.shape``.  Each subsequent
+        level halves each dimension.
+        """
         # level = 0 corresponds to the finest level
         return tuple(x // self.refinement_factor**level for x in self.shape)
 
diff --git a/xlb/grid/neon_grid.py b/xlb/grid/neon_grid.py
index e775535d..e92eb7ff 100644
--- a/xlb/grid/neon_grid.py
+++ b/xlb/grid/neon_grid.py
@@ -1,3 +1,11 @@
+"""
+Single-resolution dense grid backed by the Neon multi-GPU runtime.
+
+This module wraps ``neon.dense.dGrid`` and exposes it through the
+:class:`Grid` interface so that XLB operators can allocate and operate on
+fields transparently.
+"""
+
 import neon
 from .grid import Grid
 from xlb.precision_policy import Precision
@@ -7,10 +15,28 @@
 
 
 class NeonGrid(Grid):
+    """Dense single-resolution grid on the Neon backend.
+
+    Wraps a ``neon.dense.dGrid``.  The grid is initialized with the LBM
+    stencil derived from the provided *velocity_set* so that Neon can
+    set up the correct halo exchanges for neighbour communication.
+
+    Parameters
+    ----------
+    shape : tuple of int
+        Bounding-box dimensions of the domain ``(nx, ny, nz)`` (or
+        ``(nx, ny)`` for 2-D).
+    velocity_set : VelocitySet
+        Lattice velocity set whose stencil defines neighbour connectivity.
+    backend_config : dict, optional
+        Neon backend configuration.  Must contain ``"device_list"`` (list
+        of GPU device indices).  Defaults to ``{"device_list": [0]}``.
+    """
+
     def __init__(
         self,
-        shape,  # bounding box of the domain
-        velocity_set,  # velocity set for the grid
+        shape,
+        velocity_set,
         backend_config=None,
     ):
         from .warp_grid import WarpGrid
@@ -76,6 +102,24 @@ def create_field(
         dtype: Literal[Precision.FP32, Precision.FP64, Precision.FP16] = None,
         fill_value=None,
     ):
+        """Allocate a new Neon field on this grid.
+
+        Parameters
+        ----------
+        cardinality : int
+            Number of components per voxel (e.g. ``q`` for populations).
+        dtype : Precision, optional
+            Element precision.  Defaults to the store precision from the
+            global config.
+        fill_value : float, optional
+            If provided every element is set to this value; otherwise the
+            field is zero-initialized.
+
+        Returns
+        -------
+        neon.dense.dField
+            The newly allocated field.
+        """
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
         field = self.grid.new_field(
             cardinality=cardinality,
@@ -89,4 +133,5 @@ def create_field(
         return field
 
     def get_neon_backend(self):
+        """Return the underlying ``neon.Backend`` instance."""
         return self.bk
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index a1fd7107..78e9e83b 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -1,3 +1,17 @@
+"""
+Initializers for distribution function fields.
+
+Provides helper functions and Operator subclasses that populate
+distribution-function fields with equilibrium values.  Two usage patterns
+are supported:
+
+* **Functional helpers** (`initialize_eq`, `initialize_multires_eq`) —
+  one-shot initialization used during simulation setup.
+* **Operator classes** (`CustomInitializer`, `CustomMultiresInitializer`) —
+  reusable operators that can target the whole domain or a single boundary
+  condition region, with support for JAX, Warp, and Neon backends.
+"""
+
 import warp as wp
 from typing import Any
 from xlb import DefaultConfig
@@ -10,6 +24,35 @@
 
 
 def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=None, u=None):
+    """Initialize a distribution-function field to equilibrium.
+
+    Computes the quadratic equilibrium for the given density and velocity
+    fields and writes it into *f*.  When *rho* or *u* are ``None`` the
+    defaults are uniform density 1 and zero velocity.
+
+    Parameters
+    ----------
+    f : field
+        Distribution-function field to populate (modified in-place for
+        Warp / Neon backends; replaced for JAX).
+    grid : Grid
+        Computational grid used to allocate temporary fields.
+    velocity_set : VelocitySet
+        Lattice velocity set (e.g. D3Q19).
+    precision_policy : PrecisionPolicy
+        Precision policy for compute / store dtypes.
+    compute_backend : ComputeBackend
+        Active compute backend (JAX, WARP, or NEON).
+    rho : field, optional
+        Density field.  Defaults to uniform 1.0.
+    u : field, optional
+        Velocity field.  Defaults to uniform 0.0.
+
+    Returns
+    -------
+    field
+        The initialized distribution-function field.
+    """
     if rho is None:
         rho = grid.create_field(cardinality=1, fill_value=1.0, dtype=precision_policy.compute_precision)
     if u is None:
@@ -31,12 +74,59 @@ def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=
 
 
 def initialize_multires_eq(f, grid, velocity_set, precision_policy, backend, rho, u):
+    """Initialize a multi-resolution distribution-function field to equilibrium.
+
+    Parameters
+    ----------
+    f : field
+        Multi-resolution distribution-function field to populate.
+    grid : NeonMultiresGrid
+        Multi-resolution grid.
+    velocity_set : VelocitySet
+        Lattice velocity set.
+    precision_policy : PrecisionPolicy
+        Precision policy.
+    backend : ComputeBackend
+        Compute backend (expected to be NEON).
+    rho : field
+        Density field across all grid levels.
+    u : field
+        Velocity field across all grid levels.
+
+    Returns
+    -------
+    field
+        The initialized multi-resolution distribution-function field.
+    """
     equilibrium = MultiresQuadraticEquilibrium()
     return equilibrium(rho, u, f, stream=0)
 
 
-# Defining an initializer operator that initializes the entire domain or the specified BC to a constant velocity and density
 class CustomInitializer(Operator):
+    """Operator that initializes distribution functions to equilibrium.
+
+    When ``bc_id == -1`` (default) the entire domain is initialized with the
+    given constant velocity and density.  Otherwise only voxels whose
+    ``bc_mask`` matches *bc_id* are set while the rest receive the
+    weight-only equilibrium (zero velocity, unit density).
+
+    Supports JAX, Warp, and Neon backends.
+
+    Parameters
+    ----------
+    constant_velocity_vector : list of float
+        Macroscopic velocity [ux, uy, uz] used for initialization.
+    constant_density : float
+        Macroscopic density used for initialization.
+    bc_id : int
+        Boundary-condition ID to target.  ``-1`` means the whole domain.
+    initialization_operator : Operator, optional
+        Equilibrium operator to use.  Defaults to ``QuadraticEquilibrium``.
+    velocity_set : VelocitySet, optional
+    precision_policy : PrecisionPolicy, optional
+    compute_backend : ComputeBackend, optional
+    """
+
     def __init__(
         self,
         constant_velocity_vector=[0.0, 0.0, 0.0],
@@ -163,8 +253,13 @@ def neon_implementation(self, bc_mask, f_field, stream=0):
         return f_field
 
 
-# Defining an initializer for outlet only
 class CustomMultiresInitializer(CustomInitializer):
+    """Multi-resolution variant of :class:`CustomInitializer`.
+
+    Iterates over all grid levels and initializes distribution functions
+    using the Neon multi-resolution container API.
+    """
+
     def __init__(
         self,
         constant_velocity_vector=[0.0, 0.0, 0.0],
diff --git a/xlb/helper/nse_fields.py b/xlb/helper/nse_fields.py
index c513f735..81e01006 100644
--- a/xlb/helper/nse_fields.py
+++ b/xlb/helper/nse_fields.py
@@ -1,3 +1,11 @@
+"""
+Factory function for creating the standard Navier-Stokes field arrays.
+
+Returns the distribution-function pair (*f_0*, *f_1*), the
+boundary-condition mask, and the missing-population mask, all allocated
+on the given grid and backend.
+"""
+
 from xlb import DefaultConfig
 from xlb.grid import grid_factory
 from xlb.precision_policy import Precision
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 3d7e95b0..da7b2a32 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -1,3 +1,12 @@
+"""
+High-level simulation manager for multi-resolution LBM on the Neon backend.
+
+:class:`MultiresSimulationManager` orchestrates the complete simulation
+lifecycle: field allocation, boundary-condition setup, coalescence-factor
+precomputation, and the recursive time-stepping skeleton that correctly
+interleaves coarse and fine grid updates.
+"""
+
 import neon
 import warp as wp
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
@@ -6,8 +15,32 @@
 
 
 class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
-    """
-    A simulation manager for multiresolution simulations using the Neon backend in XLB.
+    """Orchestrates multi-resolution LBM simulations on the Neon backend.
+
+    Inherits from :class:`MultiresIncompressibleNavierStokesStepper` and
+    adds field management, omega computation across levels, and the
+    recursive skeleton builder that encodes the multi-resolution
+    time-stepping order.
+
+    Parameters
+    ----------
+    omega_finest : float
+        Relaxation parameter at the finest grid level.
+    grid : NeonMultiresGrid
+        Multi-resolution grid.
+    boundary_conditions : list of BoundaryCondition
+        Boundary conditions to apply.
+    collision_type : str
+        ``"BGK"`` or ``"KBC"``.
+    forcing_scheme : str
+        Forcing scheme (used only when *force_vector* is given).
+    force_vector : array-like, optional
+        External body force.
+    initializer : Operator, optional
+        Custom initializer for distribution functions.  If ``None``
+        the default equilibrium initialization is used.
+    mres_perf_opt : MresPerfOptimizationType
+        Performance optimization strategy.
     """
 
     def __init__(
@@ -73,6 +106,14 @@ def compute_omega(self, omega_finest, level):
         return 2 ** (level + 1) * omega0 / ((2**level - 1.0) * omega0 + 2.0)
 
     def export_macroscopic(self, fname_prefix):
+        """Compute macroscopic fields and export velocity to a VTI file.
+
+        Parameters
+        ----------
+        fname_prefix : str
+            Output filename prefix.  The iteration index is appended
+            automatically (e.g. ``"u_"`` → ``"u_42.vti"``).
+        """
         print(f"exporting macroscopic: #levels {self.count_levels}")
         self.macro(self.f_0, self.bc_mask, self.rho, self.u, streamId=0)
 
@@ -85,6 +126,12 @@ def export_macroscopic(self, fname_prefix):
         return
 
     def step(self):
+        """Advance the simulation by one coarsest-level timestep.
+
+        Internally this executes the pre-compiled Neon skeleton which
+        performs the correct number of sub-steps at each finer level
+        according to the acoustic-scaling time refinement ratio.
+        """
         self.iteration_idx = self.iteration_idx + 1
         self.sk.run()
 
@@ -124,6 +171,13 @@ def _build_recursion(self, level, app, config):
             self.add_to_app(app=app, op_name=op_name, level=level, **fields_swapped, **extra)
 
     def _construct_stepper_skeleton(self):
+        """Build the Neon skeleton that encodes the recursive time-stepping order.
+
+        The skeleton is a list of Neon container invocations that, when
+        executed in sequence, perform one coarsest-level timestep with the
+        correct sub-cycling at finer levels.  The structure depends on
+        ``self.mres_perf_opt``.
+        """
         self.app = []
 
         stream_abc = {"omega": self.coalescence_factor, "timestep": 0}
diff --git a/xlb/mres_perf_optimization_type.py b/xlb/mres_perf_optimization_type.py
index 0a1f968d..797699f5 100644
--- a/xlb/mres_perf_optimization_type.py
+++ b/xlb/mres_perf_optimization_type.py
@@ -1,3 +1,10 @@
+"""
+Multi-resolution performance-optimization strategies.
+
+Defines the kernel-fusion levels available for the multi-resolution LBM
+stepper and provides CLI argument parsing helpers.
+"""
+
 import argparse
 from enum import Enum, auto
 
diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index 97b7825c..7c5ae0b5 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -1,5 +1,8 @@
 """
-Base class for boundary conditions in a LBM simulation.
+Do-nothing boundary condition.
+
+Skips the streaming step at tagged boundary voxels, leaving the
+populations unchanged.
 """
 
 import jax.numpy as jnp
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index f24fec30..85ebe92b 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -22,8 +22,20 @@
 
 
 class EquilibriumBC(BoundaryCondition):
-    """
-    Full Bounce-back boundary condition for a lattice Boltzmann method simulation.
+    """Equilibrium boundary condition.
+
+    Sets populations at tagged voxels to the equilibrium distribution
+    computed from the prescribed macroscopic density *rho* and velocity
+    *u*.  Commonly used as an inlet or outlet condition.
+
+    Parameters
+    ----------
+    rho : float
+        Prescribed macroscopic density.
+    u : tuple of float
+        Prescribed macroscopic velocity ``(ux, uy, uz)``.
+    equilibrium_operator : Operator, optional
+        Equilibrium operator.  Defaults to ``QuadraticEquilibrium``.
     """
 
     def __init__(
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 434ad542..4ff8bbfa 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -1,5 +1,14 @@
 """
-Base class for boundary conditions in a LBM simulation.
+Extrapolation outflow boundary condition.
+
+Uses first-order extrapolation from the interior to set the unknown
+populations at outflow boundaries, avoiding strong wave reflections.
+
+Reference
+---------
+Geier, M. et al. (2015). "The cumulant lattice Boltzmann equation in
+three dimensions: Theory and validation." *Computers & Mathematics
+with Applications*, 70(4), 507-547.
 """
 
 import jax.numpy as jnp
diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index 7d2bf3ab..4b7f8f0f 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -1,5 +1,8 @@
 """
-Base class for boundary conditions in a LBM simulation.
+Full-way bounce-back boundary condition.
+
+Reverses every population at tagged solid voxels, effectively
+imposing a no-slip wall located *on* the grid node.
 """
 
 import jax.numpy as jnp
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index fa78705e..2404b48a 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -1,5 +1,10 @@
 """
-Base class for boundary conditions in a LBM simulation.
+Halfway bounce-back boundary condition.
+
+Implements the standard halfway bounce-back scheme where the no-slip
+wall is located halfway between a solid node and a fluid node.
+Optionally supports prescribed wall velocity (moving walls) and
+interpolated variants that use wall-distance data.
 """
 
 import jax.numpy as jnp
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 2e8d5db6..12a9f4e4 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -1,3 +1,20 @@
+"""
+Hybrid boundary condition combining interpolated bounce-back with regularization.
+
+Provides three wall-treatment strategies, selectable via *bc_method*:
+
+* ``"bounceback_regularized"`` — interpolated bounce-back + Latt regularization.
+* ``"bounceback_grads"`` — interpolated bounce-back + Grad's approximation.
+* ``"nonequilibrium_regularized"`` — Tao non-equilibrium bounce-back + Latt
+  regularization.
+
+All variants optionally support:
+
+* Moving walls (via *prescribed_value* or *profile*).
+* Curved boundaries with fractional distance to the mesh surface (via
+  *use_mesh_distance*).
+"""
+
 import inspect
 from jax import jit
 from functools import partial
@@ -42,6 +59,31 @@ def __init__(
         voxelization_method: MeshVoxelizationMethod = None,
         use_mesh_distance=False,
     ):
+        """
+        Parameters
+        ----------
+        bc_method : str
+            Wall-treatment strategy.  One of ``"bounceback_regularized"``,
+            ``"bounceback_grads"``, or ``"nonequilibrium_regularized"``.
+        profile : callable, optional
+            Warp function ``(index) -> u_vec`` or ``(index, timestep) -> u_vec``
+            defining the wall velocity.  Mutually exclusive with *prescribed_value*.
+        prescribed_value : float or array-like, optional
+            Constant wall velocity vector.  Mutually exclusive with *profile*.
+            If neither is given, a no-slip wall is assumed.
+        velocity_set : VelocitySet, optional
+        precision_policy : PrecisionPolicy, optional
+        compute_backend : ComputeBackend, optional
+        indices : list of array-like, optional
+            Boundary voxel indices (use this **or** *mesh_vertices*, not both).
+        mesh_vertices : np.ndarray, optional
+            Mesh triangle vertices for mesh-based voxelization.
+        voxelization_method : MeshVoxelizationMethod, optional
+            Voxelization strategy (AABB, RAY, AABB_CLOSE, etc.).
+        use_mesh_distance : bool
+            If ``True``, fractional distances to the mesh surface are
+            computed and stored for interpolated boundary schemes.
+        """
         assert bc_method in [
             "bounceback_regularized",
             "bounceback_grads",
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 8f369c47..1ea1f84a 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -1,5 +1,13 @@
 """
-Base class for boundary conditions in a LBM simulation.
+Regularized boundary condition.
+
+A non-equilibrium bounce-back scheme with additional regularization of the
+distribution function.  Applicable as velocity or pressure boundary conditions.
+
+Reference
+---------
+Latt, J. et al. (2008). "Straight velocity boundaries in the lattice
+Boltzmann method." *Physical Review E*, 77(5), 056703.
 """
 
 import jax.numpy as jnp
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index bc4efb44..9865585c 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -1,5 +1,14 @@
 """
-Base class for boundary conditions in a LBM simulation.
+Zou-He boundary condition.
+
+Sets unknown populations at velocity or pressure boundaries using
+mass and momentum conservation combined with non-equilibrium
+bounce-back.  Commonly used for inlets and outlets.
+
+Reference
+---------
+Zou, Q. & He, X. (1997). "On pressure and velocity boundary conditions
+for the lattice Boltzmann BGK model." *Physics of Fluids*, 9(6), 1591.
 """
 
 import jax.numpy as jnp
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 69624d70..4d44dcc6 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -1,5 +1,9 @@
 """
-Base class for boundary conditions in a LBM simulation.
+Base class for boundary conditions in a Lattice Boltzmann simulation.
+
+Every concrete BC inherits from :class:`BoundaryCondition`, which provides
+a registration mechanism, helper-function access, and the boilerplate
+needed to encode auxiliary data into the ``f_1`` buffer.
 """
 
 from enum import Enum, auto
@@ -20,15 +24,36 @@
 import neon
 
 
-# Enum for implementation step
 class ImplementationStep(Enum):
+    """At which algorithmic stage the boundary condition is applied."""
+
     COLLISION = auto()
     STREAMING = auto()
 
 
 class BoundaryCondition(Operator):
-    """
-    Base class for boundary conditions in a LBM simulation.
+    """Abstract base class for all LBM boundary conditions.
+
+    Each BC is registered with a unique numeric *id* and annotated with:
+
+    * ``implementation_step`` - whether it executes after streaming or after
+      collision.
+    * ``needs_aux_recovery`` / ``needs_aux_init`` - whether the BC stores
+      auxiliary data in the ``f_1`` distribution buffer.
+
+    Parameters
+    ----------
+    implementation_step : ImplementationStep
+        Phase in the LBM algorithm where this BC is applied.
+    velocity_set : VelocitySet, optional
+    precision_policy : PrecisionPolicy, optional
+    compute_backend : ComputeBackend, optional
+    indices : array-like, optional
+        Explicit voxel indices for this BC.
+    mesh_vertices : array-like, optional
+        Mesh vertices for geometry-based BCs.
+    voxelization_method : MeshVoxelizationMethod, optional
+        Voxelization strategy when *mesh_vertices* is provided.
     """
 
     def __init__(
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index e7732b07..1f981219 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -1,3 +1,16 @@
+"""
+Warp/Neon helper functions shared by multiple boundary conditions.
+
+:class:`HelperFunctionsBC` exposes ``@wp.func`` helpers for bounce-back,
+regularization, Grad's approximation, moving-wall corrections,
+interpolated BCs, and BC thread-data loading.  These are used as building
+blocks by the concrete BC classes.
+
+Also contains :class:`EncodeAuxiliaryData` and
+:class:`MultiresEncodeAuxiliaryData` operators for writing user-prescribed
+BC profiles into the ``f_1`` buffer during initialization.
+"""
+
 import inspect
 from typing import Any, Callable
 
@@ -14,6 +27,18 @@
 
 
 class HelperFunctionsBC(object):
+    """Collection of Warp/Neon ``@wp.func`` helpers for boundary conditions.
+
+    Parameters
+    ----------
+    velocity_set : VelocitySet, optional
+    precision_policy : PrecisionPolicy, optional
+    compute_backend : ComputeBackend, optional
+        Must be ``WARP`` or ``NEON`` (JAX not supported).
+    distance_decoder_function : callable, optional
+        Function to decode wall-distance data for interpolated BCs.
+    """
+
     def __init__(self, velocity_set=None, precision_policy=None, compute_backend=None, distance_decoder_function=None):
         if compute_backend == ComputeBackend.JAX:
             raise ValueError("This helper class contains helper functions only for the WARP implementation of some BCs not JAX!")
diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index cd5bb8c0..98513a9b 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -1,3 +1,10 @@
+"""
+AABB mesh-based boundary masker.
+
+Voxelizes a mesh using ``warp.mesh_query_aabb`` for approximate
+one-voxel-thick surface detection around the geometry.
+"""
+
 import warp as wp
 import neon
 from typing import Any
diff --git a/xlb/operator/boundary_masker/aabb_close.py b/xlb/operator/boundary_masker/aabb_close.py
index 68398b61..21ee31f9 100644
--- a/xlb/operator/boundary_masker/aabb_close.py
+++ b/xlb/operator/boundary_masker/aabb_close.py
@@ -1,4 +1,15 @@
-# Base class for all equilibriums
+"""
+AABB-Close boundary masker with morphological close operation.
+
+Identifies solid voxels via axis-aligned bounding-box (AABB) intersection,
+then applies a morphological *close* (dilate followed by erode) to fill
+thin gaps and small cavities in the mesh surface.  The resulting solid
+mask is used to determine boundary voxels and their missing population
+directions.
+
+Supports both Warp (single-resolution) and Neon (multi-resolution)
+backends.
+"""
 
 import numpy as np
 import warp as wp
@@ -13,8 +24,20 @@
 
 
 class MeshMaskerAABBClose(MeshBoundaryMasker):
-    """
-    Operator for creating a boundary missing_mask from an STL file
+    """Boundary masker using AABB voxelization with morphological close.
+
+    The *close* operation (dilate then erode) thickens the raw solid mask
+    by ``close_voxels`` layers before shrinking it back, sealing small
+    holes and thin slits in the mesh surface.
+
+    Parameters
+    ----------
+    velocity_set : VelocitySet, optional
+    precision_policy : PrecisionPolicy, optional
+    compute_backend : ComputeBackend, optional
+    close_voxels : int
+        Half-width of the morphological structuring element.  Must be
+        provided explicitly.
     """
 
     def __init__(
diff --git a/xlb/operator/boundary_masker/helper_functions_masker.py b/xlb/operator/boundary_masker/helper_functions_masker.py
index 35e31194..565455fe 100644
--- a/xlb/operator/boundary_masker/helper_functions_masker.py
+++ b/xlb/operator/boundary_masker/helper_functions_masker.py
@@ -1,11 +1,18 @@
+"""
+Warp/Neon helper functions shared by boundary masker operators.
+"""
+
 import warp as wp
 from typing import Any
 from xlb import DefaultConfig, ComputeBackend
 
 
 class HelperFunctionsMasker(object):
-    """
-    A collection of helper functions used for the boundary masker operators.
+    """Warp ``@wp.func`` helpers for boundary masker operators.
+
+    Provides coordinate-conversion, bounds-checking, pull-index
+    computation, and BC-index membership tests used by the mesh and
+    indices boundary maskers on both Warp and Neon backends.
     """
 
     def __init__(self, velocity_set=None, precision_policy=None, compute_backend=None):
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index b51a1c92..da4d5c80 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -1,3 +1,10 @@
+"""
+Indices-based boundary masker.
+
+Creates boundary masks from explicit arrays of voxel indices, computing
+missing-population masks via pull-index tests for each tagged voxel.
+"""
+
 from typing import Any
 import copy
 
diff --git a/xlb/operator/boundary_masker/mesh_boundary_masker.py b/xlb/operator/boundary_masker/mesh_boundary_masker.py
index aa983eb2..c6fb778d 100644
--- a/xlb/operator/boundary_masker/mesh_boundary_masker.py
+++ b/xlb/operator/boundary_masker/mesh_boundary_masker.py
@@ -1,4 +1,9 @@
-# Base class for mesh masker operators
+"""
+Abstract base class for mesh-based boundary maskers.
+
+Provides shared input preparation logic (mesh construction, kernel arrays)
+used by AABB, Ray, Winding, and AABB-Close masker subclasses.
+"""
 
 import numpy as np
 import warp as wp
@@ -12,7 +17,7 @@
 
 class MeshBoundaryMasker(Operator):
     """
-    Operator for creating a boundary missing_mask from an STL file
+    Operator for creating a boundary missing_mask from a mesh file
     """
 
     def __init__(
diff --git a/xlb/operator/boundary_masker/mesh_voxelization_method.py b/xlb/operator/boundary_masker/mesh_voxelization_method.py
index 3d2b1d6b..b0162de7 100644
--- a/xlb/operator/boundary_masker/mesh_voxelization_method.py
+++ b/xlb/operator/boundary_masker/mesh_voxelization_method.py
@@ -1,9 +1,14 @@
-# A class used to keep track of the available voxelization methods
+"""
+Mesh voxelization method registry.
+
+Defines the available voxelization strategies (AABB, Ray, AABB-Close,
+Winding) and provides a factory function to create the corresponding
+:class:`VoxelizationMethod` data object.
+"""
 
 from dataclasses import dataclass
 
 
-# Registry
 METHODS = {
     "AABB": 1,
     "RAY": 2,
@@ -14,11 +19,37 @@
 
 @dataclass
 class VoxelizationMethod:
+    """Describes a mesh voxelization strategy.
+
+    Attributes
+    ----------
+    id : int
+        Numeric identifier for the method.
+    name : str
+        Human-readable name (``"AABB"``, ``"RAY"``, etc.).
+    options : dict
+        Extra options (e.g. ``close_voxels`` for AABB_CLOSE).
+    """
+
     id: int
     name: str
     options: dict
 
 
 def MeshVoxelizationMethod(name: str, **options):
+    """Create a :class:`VoxelizationMethod` by name.
+
+    Parameters
+    ----------
+    name : str
+        One of ``"AABB"``, ``"RAY"``, ``"AABB_CLOSE"``, ``"WINDING"``.
+    **options
+        Additional keyword arguments forwarded to
+        ``VoxelizationMethod.options``.
+
+    Returns
+    -------
+    VoxelizationMethod
+    """
     assert name in METHODS.keys(), f"Unsupported voxelization method: {name}"
     return VoxelizationMethod(METHODS[name], name, options)
diff --git a/xlb/operator/boundary_masker/multires_aabb.py b/xlb/operator/boundary_masker/multires_aabb.py
index 69e9e7bc..f2aa50c5 100644
--- a/xlb/operator/boundary_masker/multires_aabb.py
+++ b/xlb/operator/boundary_masker/multires_aabb.py
@@ -1,3 +1,7 @@
+"""
+Multi-resolution AABB mesh-based boundary masker for the Neon backend.
+"""
+
 import warp as wp
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
diff --git a/xlb/operator/boundary_masker/multires_aabb_close.py b/xlb/operator/boundary_masker/multires_aabb_close.py
index 2f4976df..ffd50904 100644
--- a/xlb/operator/boundary_masker/multires_aabb_close.py
+++ b/xlb/operator/boundary_masker/multires_aabb_close.py
@@ -1,3 +1,10 @@
+"""
+Multi-resolution AABB-Close boundary masker with morphological closing.
+
+Extends the AABB-Close masker for Neon multi-resolution grids, applying
+dilate-then-erode operations to fill narrow channels with solid voxels.
+"""
+
 import warp as wp
 import neon
 from typing import Any
diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
index 1c3279b5..6223cb1b 100644
--- a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -1,3 +1,10 @@
+"""
+Multi-resolution indices-based boundary masker for the Neon backend.
+
+Creates boundary masks from explicit voxel indices on multi-resolution
+grids, computing missing-population masks for each tagged voxel.
+"""
+
 from typing import Any
 import copy
 import numpy as np
diff --git a/xlb/operator/boundary_masker/multires_ray.py b/xlb/operator/boundary_masker/multires_ray.py
index 4974cfd1..8719e33c 100644
--- a/xlb/operator/boundary_masker/multires_ray.py
+++ b/xlb/operator/boundary_masker/multires_ray.py
@@ -1,3 +1,7 @@
+"""
+Multi-resolution ray-cast mesh-based boundary masker for the Neon backend.
+"""
+
 import warp as wp
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
diff --git a/xlb/operator/boundary_masker/ray.py b/xlb/operator/boundary_masker/ray.py
index 0eb0caef..5f44a2c2 100644
--- a/xlb/operator/boundary_masker/ray.py
+++ b/xlb/operator/boundary_masker/ray.py
@@ -1,3 +1,10 @@
+"""
+Ray-cast mesh-based boundary masker.
+
+Voxelizes a mesh file by casting rays along each lattice direction using
+``warp.mesh_query_ray`` to detect surface crossings.
+"""
+
 import warp as wp
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
@@ -10,7 +17,7 @@
 
 class MeshMaskerRay(MeshBoundaryMasker):
     """
-    Operator for creating a boundary missing_mask from an STL file
+    Operator for creating a boundary missing_mask from a mesh file
     """
 
     def __init__(
diff --git a/xlb/operator/boundary_masker/winding.py b/xlb/operator/boundary_masker/winding.py
index 52b157a9..1510f3a5 100644
--- a/xlb/operator/boundary_masker/winding.py
+++ b/xlb/operator/boundary_masker/winding.py
@@ -1,3 +1,11 @@
+"""
+Winding-number mesh-based boundary masker.
+
+Uses the generalized winding-number test (``warp.mesh_query_point``) to
+classify voxels as inside or outside the mesh, providing a
+solid-detection method even for non-watertight geometries.
+"""
+
 import warp as wp
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
@@ -10,7 +18,7 @@
 
 class MeshMaskerWinding(MeshBoundaryMasker):
     """
-    Operator for creating a boundary missing_mask from an STL file
+    Operator for creating a boundary missing_mask from a mesh file
     """
 
     def __init__(
diff --git a/xlb/operator/collision/bgk.py b/xlb/operator/collision/bgk.py
index d9a7fb95..29331dc7 100644
--- a/xlb/operator/collision/bgk.py
+++ b/xlb/operator/collision/bgk.py
@@ -1,3 +1,7 @@
+"""
+Bhatnagar-Gross-Krook (BGK) single-relaxation-time collision operator.
+"""
+
 import jax.numpy as jnp
 from jax import jit
 import warp as wp
@@ -10,8 +14,14 @@
 
 
 class BGK(Collision):
-    """
-    BGK collision operator for LBM.
+    """Single-relaxation-time BGK collision operator.
+
+    Relaxes the distribution function toward equilibrium at a rate
+    controlled by the relaxation parameter *omega*::
+
+        f_out = f - omega * (f - f_eq)
+
+    Supports JAX, Warp, and Neon backends.
     """
 
     @Operator.register_backend(ComputeBackend.JAX)
diff --git a/xlb/operator/collision/forced_collision.py b/xlb/operator/collision/forced_collision.py
index 4d97f2c4..55fa631c 100644
--- a/xlb/operator/collision/forced_collision.py
+++ b/xlb/operator/collision/forced_collision.py
@@ -1,3 +1,7 @@
+"""
+Collision operator with external body-force correction.
+"""
+
 import jax.numpy as jnp
 from jax import jit
 import warp as wp
@@ -11,8 +15,19 @@
 
 
 class ForcedCollision(Collision):
-    """
-    A collision operator for LBM with external force.
+    """Collision operator that wraps another collision with a forcing term.
+
+    After the inner collision the forcing operator is applied to
+    incorporate the effect of an external body force.
+
+    Parameters
+    ----------
+    collision_operator : Operator
+        The base collision operator (e.g. :class:`BGK`).
+    forcing_scheme : str
+        Forcing scheme.  Currently only ``"exact_difference"`` is supported.
+    force_vector : array-like
+        External force vector of length ``d`` (number of spatial dimensions).
     """
 
     def __init__(
diff --git a/xlb/operator/collision/smagorinsky_les_bgk.py b/xlb/operator/collision/smagorinsky_les_bgk.py
index 4dbd001e..aa552094 100644
--- a/xlb/operator/collision/smagorinsky_les_bgk.py
+++ b/xlb/operator/collision/smagorinsky_les_bgk.py
@@ -1,3 +1,7 @@
+"""
+BGK collision operator with Smagorinsky large-eddy-simulation sub-grid model.
+"""
+
 import jax.numpy as jnp
 from jax import jit
 import warp as wp
@@ -12,8 +16,19 @@
 
 
 class SmagorinskyLESBGK(Collision):
-    """
-    BGK collision operator for LBM with Smagorinsky LES model.
+    """BGK collision with Smagorinsky LES turbulence modelling.
+
+    Adjusts the effective relaxation time based on the local strain rate
+    estimated from the non-equilibrium stress tensor, using the
+    Smagorinsky model constant *C_s*.
+
+    Parameters
+    ----------
+    velocity_set : VelocitySet, optional
+    precision_policy : PrecisionPolicy, optional
+    compute_backend : ComputeBackend, optional
+    smagorinsky_coef : float
+        Smagorinsky model constant (default 0.17).
     """
 
     def __init__(
@@ -70,31 +85,6 @@ def functional(
             # Compute the non-equilibrium distribution
             fneq = f - feq
 
-            # Sailfish implementation
-            # {
-            #  float tmp, strain;
-
-            #  strain = 0.0f;
-
-            #  // Off-diagonal components, count twice for symmetry reasons.
-            #  %for a in range(0, dim):
-            #    %for b in range(a + 1, dim):
-            #       tmp = ${cex(sym.ex_flux(grid, 'd0', a, b, config), pointers=True)} -
-            #           ${cex(sym.ex_eq_flux(grid, a, b))};
-            #       strain += 2.0f * tmp * tmp;
-            #    %endfor
-            #  %endfor
-
-            #  // Diagonal components.
-            #  %for a in range(0, dim):
-            #    tmp = ${cex(sym.ex_flux(grid, 'd0', a, a, config), pointers=True)} -
-            #        ${cex(sym.ex_eq_flux(grid, a, a))};
-            #    strain += tmp * tmp;
-            #  %endfor
-
-            #  tau0 += 0.5f * (sqrtf(tau0 * tau0 + 36.0f * ${cex(smagorinsky_const**2)} * sqrtf(strain)) - tau0);
-            # }
-
             # Compute strain
             pi_neq = _pi_vec()
             for a in range(_pi_dim):
diff --git a/xlb/operator/equilibrium/multires_quadratic_equilibrium.py b/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
index a539216e..101d4c4b 100644
--- a/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
@@ -1,3 +1,7 @@
+"""
+Multi-resolution quadratic equilibrium operator for the Neon backend.
+"""
+
 import warp as wp
 import neon
 from typing import Any
@@ -7,9 +11,11 @@
 
 
 class MultiresQuadraticEquilibrium(QuadraticEquilibrium):
-    """
-    Quadratic equilibrium of Boltzmann equation using hermite polynomials.
-    Standard equilibrium model for LBM.
+    """Quadratic equilibrium operator for multi-resolution grids (Neon only).
+
+    Computes the second-order Hermite-polynomial equilibrium distribution
+    from density and velocity at every active cell on each grid level.
+    Cells that have child refinement (halo cells) are zeroed out.
     """
 
     def __init__(self, *args, **kwargs):
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 485ae1e0..7622e467 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -1,3 +1,7 @@
+"""
+Multi-resolution momentum-transfer force operator for the Neon backend.
+"""
+
 from typing import Any
 
 import warp as wp
@@ -12,9 +16,23 @@
 
 
 class MultiresMomentumTransfer(MomentumTransfer):
-    """
-    Multiresolution Momentum Transfer operator for computing the force on a multiresolution grid.
-    This operator computes uses the same approach as its parent class for computing the forces.
+    """Momentum-transfer force computation on a multi-resolution grid.
+
+    Extends :class:`MomentumTransfer` with Neon-specific container code that
+    iterates over all grid levels.  The LBM operation sequence
+    (collide-then-stream vs. stream-then-collide) is inferred from the
+    performance optimization type.
+
+    Parameters
+    ----------
+    no_slip_bc_instance : BoundaryCondition
+        The no-slip BC whose tagged voxels define the force integration
+        surface.
+    mres_perf_opt : MresPerfOptimizationType
+        Multi-resolution performance strategy.
+    velocity_set : VelocitySet, optional
+    precision_policy : PrecisionPolicy, optional
+    compute_backend : ComputeBackend, optional
     """
 
     def __init__(
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index 4bc79b90..b841c295 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -1,3 +1,7 @@
+"""
+Multi-resolution macroscopic moment computation for the Neon backend.
+"""
+
 from functools import partial
 import jax.numpy as jnp
 from jax import jit
@@ -11,7 +15,12 @@
 
 
 class MultiresMacroscopic(Macroscopic):
-    """A class to compute both zero and first moments of distribution functions (rho, u) on a multi-resolution grid."""
+    """Compute density and velocity on a multi-resolution grid (Neon only).
+
+    Iterates over all grid levels, computing the zeroth and first moments of
+    the distribution function.  Solid voxels and voxels that have child
+    refinement (halo cells) are set to zero.
+    """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index ba71142e..2511b159 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -1,3 +1,17 @@
+"""
+Base operator module for XLB.
+
+Every LBM operator (collision, streaming, equilibrium, boundary condition,
+masker, stepper, etc.) inherits from :class:`Operator`.  The class provides:
+
+* **Backend dispatch** — ``__call__`` automatically selects the registered
+  implementation for the active compute backend.
+* **Precision management** — ``compute_dtype`` and ``store_dtype`` properties
+  return the correct type for the active backend and precision policy.
+* **Kernel construction hooks** — ``_construct_warp()`` / ``_construct_neon()``
+  are called at init time to compile backend-specific kernels and functionals.
+"""
+
 import inspect
 import traceback
 import jax
@@ -19,6 +33,17 @@ class Operator:
     _backends = {}
 
     def __init__(self, velocity_set=None, precision_policy=None, compute_backend=None):
+        """Initialize the operator.
+
+        Parameters
+        ----------
+        velocity_set : VelocitySet, optional
+            Lattice velocity set.  Defaults to ``DefaultConfig.velocity_set``.
+        precision_policy : PrecisionPolicy, optional
+            Precision policy.  Defaults to ``DefaultConfig.default_precision_policy``.
+        compute_backend : ComputeBackend, optional
+            Compute backend.  Defaults to ``DefaultConfig.default_backend``.
+        """
         # Set the default values from the global config
         self.velocity_set = velocity_set or DefaultConfig.velocity_set
         self.precision_policy = precision_policy or DefaultConfig.default_precision_policy
@@ -62,6 +87,20 @@ def decorator(func):
         return decorator
 
     def __call__(self, *args, callback=None, **kwargs):
+        """Dispatch to the registered backend implementation.
+
+        Iterates over all registered implementations for this operator class
+        and the active backend, attempts to bind the provided arguments, and
+        executes the first matching signature.  An optional *callback* is
+        invoked with the result after successful execution.
+
+        Raises
+        ------
+        NotImplementedError
+            If no implementation is registered for the active backend.
+        Exception
+            If all candidate implementations raise errors.
+        """
         method_candidates = [
             (key, method) for key, method in self._backends.items() if key[0] == self.__class__.__name__ and key[1] == self.compute_backend
         ]
@@ -179,6 +218,16 @@ def _construct_neon(self):
         return None, None
 
     def _construct_read_write_functions(self):
+        """Build backend-specific ``read_field`` / ``write_field`` helpers.
+
+        For the Warp backend these are direct 4-D array accesses.  For the
+        Neon backend they wrap ``wp.neon_read`` / ``wp.neon_write``.
+
+        Returns
+        -------
+        tuple of wp.func
+            ``(read_field, write_field)``
+        """
         if self.compute_backend == ComputeBackend.WARP:
 
             @wp.func
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index f969b97f..a42f3891 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -65,6 +65,7 @@ def my_kernel(...):
 - BC_SOLID (255): Solid obstacle voxel
 - BC_NONE (0): Regular fluid voxel with potential BCs or multi-res interactions
 """
+
 import nvtx
 import warp as wp
 import neon
@@ -100,6 +101,37 @@ def my_kernel(...):
 
 
 class MultiresIncompressibleNavierStokesStepper(Stepper):
+    """Multi-resolution incompressible Navier-Stokes stepper for the Neon backend.
+
+    Implements the full LBM step (stream, collide, boundary conditions) across
+    a hierarchy of grid levels using Neon containers.  Each container is a
+    compile-time specialized Warp kernel wrapped in a Neon execution-graph
+    node.
+
+    The stepper supports several performance optimization strategies (see
+    :class:`MresPerfOptimizationType`):
+
+    * **NAIVE_COLLIDE_STREAM** — separate collide and stream containers at
+      every level.
+    * **FUSION_AT_FINEST** — fused stream+collide at the finest level.
+    * **FUSION_AT_FINEST_SFV** — additionally splits SFV / CFV voxels at
+      the finest level for reduced branching.
+    * **FUSION_AT_FINEST_SFV_ALL** — SFV / CFV splitting at all levels.
+
+    Parameters
+    ----------
+    grid : NeonMultiresGrid
+        The multi-resolution grid.
+    boundary_conditions : list of BoundaryCondition
+        Boundary conditions to apply.
+    collision_type : str
+        Collision operator type: ``"BGK"`` or ``"KBC"``.
+    forcing_scheme : str
+        Forcing scheme name (only used when *force_vector* is given).
+    force_vector : array-like, optional
+        External body force vector.
+    """
+
     def __init__(
         self,
         grid,
@@ -172,6 +204,21 @@ def prepare_fields(self, rho, u, initializer=None):
         return f_0, f_1, bc_mask, missing_mask
 
     def prepare_coalescence_count(self, coalescence_factor, bc_mask):
+        """Precompute coalescence weighting factors for multi-resolution streaming.
+
+        For each non-halo voxel at every level, this method accumulates
+        the number of finer neighbours that contribute populations via
+        coalescence (child-to-parent transfer), then stores ``1 / (2 * count)`` for positive counts
+        so that the streaming kernel can apply the correct averaging weight.
+
+        Parameters
+        ----------
+        coalescence_factor : field
+            Multi-resolution field to store the per-direction coalescence
+            weights (modified in-place).
+        bc_mask : field
+            Boundary-condition mask used to skip solid voxels.
+        """
         lattice_central_index = self.velocity_set.center_index
         num_levels = coalescence_factor.get_grid().num_levels
 
@@ -241,8 +288,12 @@ def compute(index: Any):
                         if not wp.neon_has_finer_ngh(coalescence_factor_pn, index, pull_direction):
                             pass
                         else:
+                            # Finer neighbour exists in the pull direction (opposite of l).
+                            # Read from the halo sitting on top of that finer neighbour.
                             if has_ngh_at_same_level:
-                                # Coalescence
+                                # Finer ngh in pull direction: YES
+                                # Same-level ngh:              YES
+                                # Compute coalescence factor
                                 if coalescence_factor > self.compute_dtype(0):
                                     coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor)
                                     wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
@@ -366,11 +417,15 @@ def apply_bc_impl(
                     if wp.static(is_post_streaming):
                         if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.STREAMING):
                             if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                                f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
+                                f_result = wp.static(self.boundary_conditions[i].neon_functional)(
+                                    index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
+                                )
                     else:
                         if wp.static(self.boundary_conditions[i].implementation_step == ImplementationStep.COLLISION):
                             if _boundary_id == wp.static(self.boundary_conditions[i].id):
-                                f_result = wp.static(self.boundary_conditions[i].neon_functional)(index, timestep, _missing_mask, f_0, f_1, f_pre, f_post)
+                                f_result = wp.static(self.boundary_conditions[i].neon_functional)(
+                                    index, timestep, _missing_mask, f_0, f_1, f_pre, f_post
+                                )
                         if wp.static(self.boundary_conditions[i].id in extrapolation_outflow_bc_ids):
                             if _boundary_id == wp.static(self.boundary_conditions[i].id):
                                 f_result = wp.static(self.boundary_conditions[i].assemble_auxiliary_data)(
@@ -450,7 +505,9 @@ def collide_pipeline_impl(
                 _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, omega)
 
                 if wp.static(do_bc):
-                    _f_post_collision = apply_bc_post_collision(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision)
+                    _f_post_collision = apply_bc_post_collision(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision
+                    )
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                 if wp.static(do_accumulation):
@@ -490,14 +547,26 @@ def neon_stream_explode_coalesce(
                 accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                 if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                    # No finer ngh in the pull direction (opposite of l)
                     if not has_ngh_at_same_level:
+                        # No same-level ngh — could we have a coarser-level ngh?
                         if wp.neon_has_parent(f_0_pn, index):
+                            # Halo cell on top of us (parent exists)
                             has_a_coarser_ngh = wp.bool(False)
                             exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh)
                             if has_a_coarser_ngh:
+                                # No finer ngh in pull direction, no same-level ngh,
+                                # but a parent (ghost cell) exists with a coarser ngh
+                                # -> Explosion: read the exploded population from the
+                                #    coarser level's halo.
                                 _f_post_stream[l] = exploded_pop
                 else:
+                    # Finer ngh exists in the pull direction (opposite of l).
+                    # Read from the halo on top of that finer ngh.
                     if has_ngh_at_same_level:
+                        # Finer ngh in pull direction: YES
+                        # Same-level ngh:              YES
+                        # -> Coalescence
                         coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                         accumulated = accumulated * coalescence_factor
                         _f_post_stream[l] = accumulated
@@ -505,13 +574,7 @@ def neon_stream_explode_coalesce(
             return _f_post_stream
 
         @neon.Container.factory(name="collide_coarse")
-        def collide_coarse(level: int,
-                           f_0_fd: Any,
-                           f_1_fd: Any,
-                           bc_mask_fd: Any,
-                           missing_mask_fd: Any,
-                           omega: Any,
-                           timestep: int):
+        def collide_coarse(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
             num_levels = f_0_fd.get_grid().num_levels
 
             def ll(loader: neon.Loader):
@@ -554,13 +617,7 @@ def device(index: Any):
             return ll
 
         @neon.Container.factory(name="SFV_collide_coarse")
-        def SFV_collide_coarse(level: int,
-                               f_0_fd: Any,
-                               f_1_fd: Any,
-                               bc_mask_fd: Any,
-                               missing_mask_fd: Any,
-                               omega: Any,
-                               timestep: int):
+        def SFV_collide_coarse(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
             """Collision on SFV voxels only — no BCs, no multi-resolution accumulation."""
 
             def ll(loader: neon.Loader):
@@ -641,13 +698,7 @@ def device(index: Any):
             return ll
 
         @neon.Container.factory(name="stream_coarse_step_ABC")
-        def stream_coarse_step_ABC(level: int,
-                                   f_0_fd: Any,
-                                   f_1_fd: Any,
-                                   bc_mask_fd: Any,
-                                   missing_mask_fd: Any,
-                                   omega: Any,
-                                   timestep: int):
+        def stream_coarse_step_ABC(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
             def ll(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
@@ -668,7 +719,9 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_explode_coalesce(index, f_0_pn, coalescence_factor_pn)
 
-                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
+                    _f_post_stream = apply_bc_post_streaming(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream
+                    )
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
@@ -679,13 +732,7 @@ def device(index: Any):
             return ll
 
         @neon.Container.factory(name="SFV_stream_coarse_step_ABC")
-        def SFV_stream_coarse_step_ABC(level: int,
-                                       f_0_fd: Any,
-                                       f_1_fd: Any,
-                                       bc_mask_fd: Any,
-                                       missing_mask_fd: Any,
-                                       omega: Any,
-                                       timestep: int):
+        def SFV_stream_coarse_step_ABC(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, timestep: int):
             """Stream on CFV voxels only — skips SFV and solid."""
 
             def ll(loader: neon.Loader):
@@ -710,7 +757,9 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_explode_coalesce(index, f_0_pn, coalescence_factor_pn)
 
-                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
+                    _f_post_stream = apply_bc_post_streaming(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream
+                    )
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
@@ -790,11 +839,7 @@ def cl_stream_coarse(index: Any):
             return ll_stream_coarse
 
         @neon.Container.factory(name="SFV_stream_coarse_step")
-        def SFV_stream_coarse_step(level: int,
-                                   f_0_fd: Any,
-                                   f_1_fd: Any,
-                                   bc_mask_fd: Any,
-                                   missing_mask_fd: Any):
+        def SFV_stream_coarse_step(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any):
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -846,12 +891,18 @@ def neon_stream_finest_with_explosion(
                 wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
 
                 if not has_ngh_at_same_level:
+                    # No same-level ngh — could we have a coarser-level ngh?
                     if wp.neon_has_parent(f_0_pn, index):
+                        # Parent exists — try to read the exploded population from the coarser level
                         has_a_coarser_ngh = wp.bool(False)
                         exploded_pop = wp.neon_lbm_read_coarser_ngh(
                             explosion_src_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
                         )
                         if has_a_coarser_ngh:
+                            # No finer ngh in pull direction, no same-level ngh,
+                            # but a parent (ghost cell) exists with a coarser ngh
+                            # -> Explosion: read the exploded population from the
+                            #    coarser level's halo.
                             _f_post_stream[l] = exploded_pop
 
             return _f_post_stream
@@ -896,7 +947,9 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_finest_with_explosion(index, f_0_pn, explosion_src_pn)
 
-                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
+                    _f_post_stream = apply_bc_post_streaming(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream
+                    )
 
                     collide_bc_accum(
                         index,
@@ -959,7 +1012,9 @@ def device(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = neon_stream_finest_with_explosion(index, f_0_pn, explosion_src_pn)
 
-                    _f_post_stream = apply_bc_post_streaming(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream)
+                    _f_post_stream = apply_bc_post_streaming(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream
+                    )
 
                     collide_bc_accum(
                         index,
@@ -1031,9 +1086,35 @@ def device(index: Any):
         }
 
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
+        """Immediately launch a single Neon container by name.
+
+        Parameters
+        ----------
+        streamId : int
+            Stream index.  Currently unused: the container is always launched with ``run(0)``.
+        op_name : str
+            Key into the container dictionary returned by ``_construct_neon``.
+        mres_level : int
+            Grid level to execute on.
+        f_0, f_1 : field
+            Double-buffered distribution-function fields.
+        bc_mask, missing_mask : field
+            Boundary condition and missing-population masks.
+        omega : float
+            Relaxation parameter at this level.
+        timestep : int
+            Current simulation timestep.
+        """
         self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
     def add_to_app(self, **kwargs):
+        """Append a container invocation to the Neon skeleton application list.
+
+        Required keyword arguments are ``op_name`` (str) and ``app`` (list).
+        All remaining keyword arguments are forwarded to the container
+        factory for the given ``op_name``.  Argument validation is performed
+        before the call, and a ``ValueError`` is raised on mismatch.
+        """
         import inspect
 
         def validate_kwargs_forward(func, kwargs):
@@ -1108,6 +1189,7 @@ def validate_kwargs_forward(func, kwargs):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
+        """Execute a single LBM step through the Neon backend (direct launch)."""
         c = self.neon_container(f_0, f_1, bc_mask, missing_mask, omega, timestep)
         c.run(0)
         return f_0, f_1
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 05e59a17..bcb674b8 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -1,4 +1,10 @@
-# Base class for all stepper operators
+"""
+Single-resolution incompressible Navier-Stokes stepper.
+
+Implements the full LBM step (stream, collide, apply BCs) for a single-
+resolution grid.  Supports pull and push streaming schemes on JAX, a
+pull-only fused kernel on Warp, and a pull-only Neon container.
+"""
 
 from functools import partial
 
@@ -34,6 +40,29 @@
 
 
 class IncompressibleNavierStokesStepper(Stepper):
+    """Single-resolution incompressible Navier-Stokes LBM stepper.
+
+    Composes streaming, collision, equilibrium, macroscopic, and boundary-
+    condition operators into a complete timestep.
+
+    Parameters
+    ----------
+    grid : Grid
+        Computational grid.
+    boundary_conditions : list of BoundaryCondition
+        Boundary conditions to apply each step.
+    collision_type : str
+        ``"BGK"``, ``"KBC"``, or ``"SmagorinskyLESBGK"``.
+    streaming_scheme : str
+        ``"pull"`` (default) or ``"push"`` (JAX only).
+    forcing_scheme : str
+        Forcing scheme name (used when *force_vector* is given).
+    force_vector : array-like, optional
+        External body force vector.
+    backend_config : dict
+        Backend-specific options (e.g. Neon OCC configuration).
+    """
+
     def __init__(
         self,
         grid,
@@ -608,6 +637,7 @@ def neon_launch(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         return f_0, f_1
 
     def prepare_skeleton(self, f_0, f_1, bc_mask, missing_mask, omega):
+        """Build the Neon odd/even skeletons for double-buffered time stepping."""
         grid = f_0.get_grid()
         bk = grid.backend
         self.neon_skeleton = {"odd": {}, "even": {}}
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index cda8547b..cd1466aa 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -1,4 +1,9 @@
-# Base class for all streaming operators
+"""
+Streaming operator for the Lattice Boltzmann Method.
+
+Implements the pull-scheme propagation step: each voxel reads populations
+from its lattice neighbours according to the velocity-set directions.
+"""
 
 from functools import partial
 import jax.numpy as jnp
@@ -11,8 +16,14 @@
 
 
 class Stream(Operator):
-    """
-    Base class for all streaming operators. This is used for pulling the distribution
+    """Pull-scheme streaming operator.
+
+    Propagates distribution functions by reading each population from the
+    upstream neighbour along the corresponding lattice direction.  Periodic
+    boundaries are applied automatically when a pull index falls outside
+    the domain (Warp backend only; JAX uses ``jnp.roll``).
+
+    Supports JAX, Warp, and Neon backends.
     """
 
     @Operator.register_backend(ComputeBackend.JAX)
diff --git a/xlb/precision_policy.py b/xlb/precision_policy.py
index 39e3e096..32a6d567 100644
--- a/xlb/precision_policy.py
+++ b/xlb/precision_policy.py
@@ -1,9 +1,18 @@
-# Enum for precision policy
+"""
+Precision and precision-policy enumerations for XLB.
+
+:class:`Precision` maps symbolic precisions to Warp and JAX dtypes.
+:class:`PrecisionPolicy` pairs a *compute* precision (used during
+arithmetic) with a *store* precision (used in memory), enabling
+mixed-precision simulations.
+"""
 
 from enum import Enum, auto
 
 
 class Precision(Enum):
+    """Scalar precision levels with Warp and JAX dtype accessors."""
+
     FP64 = auto()
     FP32 = auto()
     FP16 = auto()
@@ -46,6 +55,12 @@ def jax_dtype(self):
 
 
 class PrecisionPolicy(Enum):
+    """Mixed-precision policy pairing compute and store precisions.
+
+    The naming convention is ``<compute><store>``, e.g. ``FP32FP16``
+    computes in FP32 and stores results in FP16.
+    """
+
     FP64FP64 = auto()
     FP64FP32 = auto()
     FP64FP16 = auto()
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index c84a4063..74e4796f 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -1,3 +1,16 @@
+"""
+Multi-resolution mesh utilities.
+
+Provides geometry preparation and I/O for multi-resolution LBM simulations:
+
+* :func:`make_cuboid_mesh` — builds a strongly-balanced cuboid mesh hierarchy
+  from an STL file and a sequence of domain multipliers.
+* :func:`prepare_sparsity_pattern` — converts level data into the sparsity
+  arrays required by :func:`multires_grid_factory`.
+* :class:`MultiresIO` — exports multi-resolution Neon field data to HDF5 /
+  XDMF, 2-D slice images, and 1-D line profiles.
+"""
+
 import numpy as np
 import trimesh
 from typing import Any, Optional
@@ -150,6 +163,17 @@ def make_cuboid_mesh(voxel_size, cuboids, stl_filename):
 
 
 class MultiresIO(object):
+    """I/O helper for multi-resolution Neon field data.
+
+    Converts hierarchical Neon ``mGrid`` fields into merged unstructured
+    hexahedral meshes and exports them as HDF5 + XDMF (for ParaView),
+    2-D slice PNG images, or 1-D line CSV profiles.
+
+    The constructor precomputes the merged geometry (coordinates,
+    connectivity, centroids) and allocates intermediate Warp fields so
+    that repeated exports only need to transfer data from the Neon fields.
+    """
+
     def __init__(
         self,
         field_name_cardinality_dict,
@@ -213,6 +237,19 @@ def __init__(
         self.container = self._construct_neon_container()
 
     def process_geometry(self, levels_data):
+        """Build merged coordinates and connectivity from all levels.
+
+        Returns
+        -------
+        coordinates : np.ndarray, shape (N, 3)
+            Vertex positions (8 per active voxel, before deduplication).
+        connectivity : np.ndarray, shape (M, 8)
+            Hexahedral connectivity (one row per active voxel).
+        level_id_field : np.ndarray, shape (M,)
+            Grid level index for each cell.
+        total_cells : int
+            Total number of active voxels across all levels.
+        """
         num_voxels_per_level = [np.sum(data) for data, _, _, _ in levels_data]
         num_points_per_level = [8 * nv for nv in num_voxels_per_level]
         point_id_offsets = np.cumsum([0] + num_points_per_level[:-1])
@@ -295,6 +332,7 @@ def _process_voxel_chunk(self, true_indices, origin, voxel_size, point_id_offset
         return corners, connectivity
 
     def save_xdmf(self, h5_filename, xmf_filename, total_cells, num_points, fields={}):
+        """Write an XDMF descriptor that references the companion HDF5 file."""
         # Generate an XDMF file to accompany the HDF5 file
         print(f"\tGenerating XDMF file: {xmf_filename}")
         hdf5_rel_path = h5_filename.split("/")[-1]
@@ -372,6 +410,11 @@ def save_hdf5_file(self, filename, coordinates, connectivity, level_id_field, fi
                 fg.create_dataset(fname, data=fdata.astype(np.float32), compression=compression, compression_opts=compression_opts, chunks=True)
 
     def _merge_duplicates(self, coordinates, connectivity, levels_data):
+        """Deduplicate vertices shared between adjacent voxels.
+
+        Uses spatial hashing (grid-snapped coordinates) processed in
+        chunks to keep memory bounded.
+        """
         # Merging duplicate points
         tolerance = 0.01
         chunk_size = 10_000_000  # Adjust based on GPU memory
@@ -404,12 +447,14 @@ def _merge_duplicates(self, coordinates, connectivity, levels_data):
         return coordinates, connectivity
 
     def _transform_coordinates(self, coordinates, offset):
+        """Convert lattice coordinates to physical units (only if a unit convertor is set) and add *offset*."""
         offset = np.array(offset, dtype=np.float32)
         if self.unit_convertor is not None:
             coordinates = self.unit_convertor.length_to_physical(coordinates)
         return coordinates + offset
 
     def _prepare_container_inputs(self):
+        """Allocate dense Warp fields used as staging buffers for Neon-to-NumPy transfer."""
         # load necessary modules
         from xlb.compute_backend import ComputeBackend
         from xlb.grid import grid_factory
diff --git a/xlb/utils/utils.py b/xlb/utils/utils.py
index bb1f7782..d707fb5c 100644
--- a/xlb/utils/utils.py
+++ b/xlb/utils/utils.py
@@ -1,3 +1,11 @@
+"""
+General-purpose utilities for XLB.
+
+Includes helpers for field downsampling, VTK/image/USD I/O, geometry
+rotation, STL voxelization, Neon-to-JAX field transfer, and
+physical-to-lattice unit conversion.
+"""
+
 import numpy as np
 import matplotlib.pylab as plt
 from matplotlib import cm
@@ -320,9 +328,10 @@ def axangle2mat(axis, angle, is_normalized=False):
 
 
 class ToJAX(object):
+    """Convert a Neon field to a JAX array via an intermediate Warp grid."""
+
     def __init__(self, field_name, field_cardinality, grid_shape, store_precision=None):
-        """
-        Initialize the MultiresIO object.
+        """Initialise the Neon-to-JAX converter.
 
         Parameters
         ----------
@@ -332,8 +341,8 @@ def __init__(self, field_name, field_cardinality, grid_shape, store_precision=No
             The cardinality of the field to be converted.
         grid_shape : tuple
             The shape of the grid on which the field is defined.
-        store_precision : str, optional
-            The precision policy for storing data.
+        store_precision : Precision, optional
+            Storage precision.  Defaults to the global config value.
         """
         from xlb.compute_backend import ComputeBackend
         from xlb.grid import grid_factory
diff --git a/xlb/velocity_set/velocity_set.py b/xlb/velocity_set/velocity_set.py
index da3fc6f2..6f9634b2 100644
--- a/xlb/velocity_set/velocity_set.py
+++ b/xlb/velocity_set/velocity_set.py
@@ -1,4 +1,10 @@
-# Base Velocity Set class
+"""
+Base velocity-set class for the Lattice Boltzmann Method.
+
+Defines lattice directions, weights, and derived properties (opposite
+indices, moments, etc.) for any DdQq stencil.  Backend-specific constants
+(Warp vectors, JAX arrays, Neon lattice objects) are initialised lazily.
+"""
 
 import math
 import numpy as np

From ba92286330e662c36404cb61ad1df767a163d5a9 Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 13 Mar 2026 12:56:23 -0400
Subject: [PATCH 206/208] Fixed mixed precision handling of the Neon backend
 for single-res and multi-res by ensuring consistent use of `store_dtype` and
 `compute_dtype`.

---
 examples/cfd/multires_windtunnel_3d.py        |  4 +-
 xlb/helper/initializers.py                    |  6 +--
 .../bc_extrapolation_outflow.py               |  2 +-
 .../bc_halfway_bounce_back.py                 |  4 +-
 xlb/operator/boundary_condition/bc_hybrid.py  |  2 +-
 .../multires_quadratic_equilibrium.py         |  6 +--
 .../equilibrium/quadratic_equilibrium.py      |  7 ++--
 xlb/operator/force/momentum_transfer.py       |  4 +-
 xlb/operator/macroscopic/macroscopic.py       |  6 +--
 .../macroscopic/multires_macroscopic.py       |  6 +--
 xlb/operator/stepper/nse_multires_stepper.py  | 42 +++++++++----------
 xlb/operator/stepper/nse_stepper.py           |  5 +--
 xlb/operator/stream/stream.py                 |  2 +-
 13 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/examples/cfd/multires_windtunnel_3d.py b/examples/cfd/multires_windtunnel_3d.py
index ad02a3c5..d94d4357 100644
--- a/examples/cfd/multires_windtunnel_3d.py
+++ b/examples/cfd/multires_windtunnel_3d.py
@@ -478,8 +478,8 @@ def _load_sim_line(csv_path):
 print(f"Coarsest voxel size: {delta_x_coarse} meters")
 print(f"Total voxels: {sum(np.count_nonzero(mask) for mask in sparsity_pattern):,}")
 print(f"Total active voxels: {total_voxels:,}")
-print(f"Active voxels per level: {active_voxels}")
-print(f"Solid voxels per level: {solid_voxels}")
+print(f"Active voxels per level: {[int(v) for v in active_voxels]}")
+print(f"Solid voxels per level: {[int(v) for v in solid_voxels]}")
 print(f"Total lattice updates per global step: {total_lattice_updates_per_step:,}")
 print(f"Number of refinement levels: {num_levels}")
 print(f"Physical inlet velocity: {wind_speed_mps:.4f} m/s")
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 78e9e83b..063dcafa 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -179,18 +179,18 @@ def functional_local(index: Any, bc_mask: Any, f_field: Any):
             if self.read_field(bc_mask, index, 0) == bc_id:
                 _f_init = self.initialization_operator.warp_functional(_rho, _u)
                 for l in range(_q):
-                    self.write_field(f_field, index, l, _f_init[l])
+                    self.write_field(f_field, index, l, self.store_dtype(_f_init[l]))
             else:
                 # In the rest of the domain, we assume zero velocity and equilibrium distribution.
                 for l in range(_q):
-                    self.write_field(f_field, index, l, _w[l])
+                    self.write_field(f_field, index, l, self.store_dtype(_w[l]))
 
         @wp.func
         def functional_domain(index: Any, bc_mask: Any, f_field: Any):
             # If bc_id is -1, initialize the entire domain according to the custom initialization operator for the given velocity
             _f_init = self.initialization_operator.warp_functional(_rho, _u)
             for l in range(_q):
-                self.write_field(f_field, index, l, _f_init[l])
+                self.write_field(f_field, index, l, self.store_dtype(_f_init[l]))
 
         # Set the functional based on whether we are initializing a specific BC or the entire domain
         functional = functional_local if self.bc_id != -1 else functional_domain
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 4ff8bbfa..a1a26e0b 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -238,7 +238,7 @@ def assemble_auxiliary_data_neon(
                     # The following is the post-streaming values of the neighbor cell
                     # This function reads a field value at a given neighboring index and direction.
                     unused_is_valid = wp.bool(False)
-                    f_aux = self.compute_dtype(wp.neon_read_ngh(f_0, index, offset_pull_index, lattice_dir, self.compute_dtype(0.0), unused_is_valid))
+                    f_aux = self.compute_dtype(wp.neon_read_ngh(f_0, index, offset_pull_index, lattice_dir, self.store_dtype(0.0), unused_is_valid))
                     _f[_opp_indices[lattice_dir]] = (self.compute_dtype(1.0) - sound_speed) * _f_pre[lattice_dir] + sound_speed * f_aux
             return _f
 
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index 2404b48a..3ee31584 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -89,11 +89,11 @@ def __init__(
             if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
                 if self.velocity_set.d == 2:
                     prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
-                prescribed_value = wp.vec(3, dtype=self.store_dtype)(prescribed_value)
+                prescribed_value = wp.vec(3, dtype=self.compute_dtype)(prescribed_value)
             self.profile = self._create_constant_prescribed_profile(prescribed_value)
 
     def _create_constant_prescribed_profile(self, prescribed_value):
-        _u_vec = wp.vec(3, dtype=self.precision_policy.store_precision.wp_dtype)
+        _u_vec = wp.vec(3, dtype=self.compute_dtype)
 
         @wp.func
         def prescribed_profile_warp(index: Any, time: Any):
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
index 12a9f4e4..62584160 100644
--- a/xlb/operator/boundary_condition/bc_hybrid.py
+++ b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -149,7 +149,7 @@ def __init__(
                 prescribed_value = np.array([prescribed_value[0], prescribed_value[1], 0.0], dtype=np.float64)
 
             # create a constant prescribed profile
-            _u_vec = wp.vec(3, dtype=self.store_dtype)
+            _u_vec = wp.vec(3, dtype=self.compute_dtype)
             prescribed_value = _u_vec(prescribed_value)
 
             @wp.func
diff --git a/xlb/operator/equilibrium/multires_quadratic_equilibrium.py b/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
index 101d4c4b..cc289e41 100644
--- a/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
@@ -48,8 +48,8 @@ def quadratic_equilibrium_ll(loader: neon.Loader):
                 def quadratic_equilibrium_cl(index: Any):
                     _u = _u_vec()
                     for d in range(self.velocity_set.d):
-                        _u[d] = wp.neon_read(u_pn, index, d)
-                    _rho = wp.neon_read(rho_pn, index, 0)
+                        _u[d] = self.compute_dtype(wp.neon_read(u_pn, index, d))
+                    _rho = self.compute_dtype(wp.neon_read(rho_pn, index, 0))
                     feq = functional(_rho, _u)
 
                     if wp.neon_has_child(f_pn, index):
@@ -57,7 +57,7 @@ def quadratic_equilibrium_cl(index: Any):
                             feq[l] = self.compute_dtype(0.0)
                     # Set the output
                     for l in range(self.velocity_set.q):
-                        wp.neon_write(f_pn, index, l, feq[l])
+                        wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
 
                 loader.declare_kernel(quadratic_equilibrium_cl)
 
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index 316bf1ab..890df126 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -128,14 +128,13 @@ def quadratic_equilibrium_ll(loader: neon.Loader):
                 def quadratic_equilibrium_cl(index: typing.Any):
                     _u = _u_vec()
                     for d in range(self.velocity_set.d):
-                        _u[d] = wp.neon_read(u_pn, index, d)
-                    _rho = wp.neon_read(rho_pn, index, 0)
+                        _u[d] = self.compute_dtype(wp.neon_read(u_pn, index, d))
+                    _rho = self.compute_dtype(wp.neon_read(rho_pn, index, 0))
                     feq = functional(_rho, _u)
 
                     # Set the output
                     for l in range(self.velocity_set.q):
-                        # wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
-                        wp.neon_write(f_pn, index, l, feq[l])
+                        wp.neon_write(f_pn, index, l, self.store_dtype(feq[l]))
 
                 loader.declare_kernel(quadratic_equilibrium_cl)
 
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index c5de920c..c24ab349 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -290,7 +290,7 @@ def kernel(
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
         # Ensure the force is initialized to zero
-        self.force *= 0.0
+        self.force *= self.compute_dtype(0.0)
 
         # Define the warp functionals needed for this operation
         self.fetcher_functional = self.fetcher.warp_functional
@@ -350,7 +350,7 @@ def neon_implementation(
         stream=0,
     ):
         # Ensure the force is initialized to zero
-        self.force *= 0.0
+        self.force *= self.compute_dtype(0.0)
 
         # Define the neon functionals needed for this operation
         self.fetcher_functional = self.fetcher.neon_functional
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 00d0076c..63d25ff4 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -94,11 +94,11 @@ def macroscopic_ll(loader: neon.Loader):
                 def macroscopic_cl(gIdx: typing.Any):
                     _f = _f_vec()
                     for l in range(self.velocity_set.q):
-                        _f[l] = wp.neon_read(f, gIdx, l)
+                        _f[l] = self.compute_dtype(wp.neon_read(f, gIdx, l))
                     _rho, _u = functional(_f)
-                    wp.neon_write(rho, gIdx, 0, _rho)
+                    wp.neon_write(rho, gIdx, 0, self.store_dtype(_rho))
                     for d in range(_d):
-                        wp.neon_write(u, gIdx, d, _u[d])
+                        wp.neon_write(u, gIdx, d, self.store_dtype(_u[d]))
 
                 loader.declare_kernel(macroscopic_cl)
 
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index b841c295..b754fb43 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -61,7 +61,7 @@ def macroscopic_cl(gIdx: typing.Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, gIdx, 0)
 
                     for l in range(self.velocity_set.q):
-                        _f[l] = wp.neon_read(f, gIdx, l)
+                        _f[l] = self.compute_dtype(wp.neon_read(f, gIdx, l))
 
                     _rho, _u = functional(_f)
 
@@ -70,9 +70,9 @@ def macroscopic_cl(gIdx: typing.Any):
                         for d in range(_d):
                             _u[d] = self.compute_dtype(0.0)
 
-                    wp.neon_write(rho, gIdx, 0, _rho)
+                    wp.neon_write(rho, gIdx, 0, self.store_dtype(_rho))
                     for d in range(_d):
-                        wp.neon_write(u, gIdx, d, _u[d])
+                        wp.neon_write(u, gIdx, d, self.store_dtype(_u[d]))
 
                 loader.declare_kernel(macroscopic_cl)
 
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index a42f3891..059f4f41 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -242,7 +242,7 @@ def cl_collide_coarse(index: Any):
                         for l in range(self.velocity_set.q):
                             if level < num_levels - 1:
                                 push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
-                                val = self.compute_dtype(1)
+                                val = self.store_dtype(1)
                                 wp.neon_mres_lbm_store_op(coalescence_factor_pn, index, l, push_direction, val)
 
                 loader.declare_kernel(cl_collide_coarse)
@@ -281,8 +281,8 @@ def compute(index: Any):
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        coalescence_factor = wp.neon_read_ngh(
-                            coalescence_factor_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level
+                        coalescence_factor = self.compute_dtype(
+                            wp.neon_read_ngh(coalescence_factor_pn, index, pull_direction, l, self.store_dtype(0), has_ngh_at_same_level)
                         )
 
                         if not wp.neon_has_finer_ngh(coalescence_factor_pn, index, pull_direction):
@@ -296,7 +296,7 @@ def compute(index: Any):
                                 # Compute coalescence factor
                                 if coalescence_factor > self.compute_dtype(0):
                                     coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor)
-                                    wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
+                                    wp.neon_write(coalescence_factor_pn, index, l, self.store_dtype(coalescence_factor))
 
                 loader.declare_kernel(compute)
 
@@ -514,11 +514,11 @@ def collide_pipeline_impl(
                     for l in range(self.velocity_set.q):
                         push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
                         if level < num_levels - 1:
-                            wp.neon_mres_lbm_store_op(accumulation_pn, index, l, push_direction, _f_post_collision[l])
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                            wp.neon_mres_lbm_store_op(accumulation_pn, index, l, push_direction, self.store_dtype(_f_post_collision[l]))
+                        wp.neon_write(f_1_pn, index, l, self.store_dtype(_f_post_collision[l]))
                 else:
                     for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                        wp.neon_write(f_1_pn, index, l, self.store_dtype(_f_post_collision[l]))
 
                 return _f_post_collision
 
@@ -544,7 +544,7 @@ def neon_stream_explode_coalesce(
                 pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                 has_ngh_at_same_level = wp.bool(False)
-                accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.store_dtype(0), has_ngh_at_same_level)
 
                 if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
                     # No finer ngh in the pull direction (opposite of l)
@@ -553,13 +553,13 @@ def neon_stream_explode_coalesce(
                         if wp.neon_has_parent(f_0_pn, index):
                             # Halo cell on top of us (parent exists)
                             has_a_coarser_ngh = wp.bool(False)
-                            exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh)
+                            exploded_pop = wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.store_dtype(0), has_a_coarser_ngh)
                             if has_a_coarser_ngh:
                                 # No finer ngh in pull direction, no same-level ngh,
                                 # but a parent (ghost cell) exists with a coarser ngh
                                 # -> Explosion: read the exploded population from the
                                 #    coarser level's halo.
-                                _f_post_stream[l] = exploded_pop
+                                _f_post_stream[l] = self.compute_dtype(exploded_pop)
                 else:
                     # Finer ngh exists in the pull direction (opposite of l).
                     # Read from the halo on top of that finer ngh.
@@ -569,7 +569,7 @@ def neon_stream_explode_coalesce(
                         # -> Coalescence
                         coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
                         accumulated = accumulated * coalescence_factor
-                        _f_post_stream[l] = accumulated
+                        _f_post_stream[l] = self.compute_dtype(accumulated)
 
             return _f_post_stream
 
@@ -610,7 +610,7 @@ def device(index: Any):
                         )
                     else:
                         for l in range(self.velocity_set.q):
-                            wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
+                            wp.neon_write(f_1_pn, index, l, self.store_dtype(0))
 
                 loader.declare_kernel(device)
 
@@ -691,7 +691,7 @@ def device(index: Any):
                         )
                     else:
                         for l in range(self.velocity_set.q):
-                            wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
+                            wp.neon_write(f_1_pn, index, l, self.store_dtype(0))
 
                 loader.declare_kernel(device)
 
@@ -725,7 +725,7 @@ def device(index: Any):
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                        wp.neon_write(f_1_pn, index, l, self.store_dtype(_f_post_stream[l]))
 
                 loader.declare_kernel(device)
 
@@ -763,7 +763,7 @@ def device(index: Any):
                     neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
 
                     for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                        wp.neon_write(f_1_pn, index, l, self.store_dtype(_f_post_stream[l]))
 
                 loader.declare_kernel(device)
 
@@ -816,13 +816,13 @@ def cl_stream_coarse(index: Any):
                         pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                         has_ngh_at_same_level = wp.bool(False)
-                        wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                        wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.store_dtype(0), has_ngh_at_same_level)
 
                         if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
                             if not has_ngh_at_same_level:
                                 if wp.neon_has_parent(f_0_pn, index):
                                     has_a_coarser_ngh = wp.bool(False)
-                                    wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh)
+                                    wp.neon_lbm_read_coarser_ngh(f_0_pn, index, pull_direction, l, self.store_dtype(0), has_a_coarser_ngh)
                                     if has_a_coarser_ngh:
                                         # Explosion: not an SFV
                                         return
@@ -867,7 +867,7 @@ def cl_stream_coarse(index: Any):
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
                     for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                        wp.neon_write(f_1_pn, index, l, self.store_dtype(_f_post_stream[l]))
 
                 loader.declare_kernel(cl_stream_coarse)
 
@@ -888,7 +888,7 @@ def neon_stream_finest_with_explosion(
                 pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
 
                 has_ngh_at_same_level = wp.bool(False)
-                wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+                wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.store_dtype(0), has_ngh_at_same_level)
 
                 if not has_ngh_at_same_level:
                     # No same-level ngh — could we have a coarser-level ngh?
@@ -896,14 +896,14 @@ def neon_stream_finest_with_explosion(
                         # Parent exists — try to read the exploded population from the coarser level
                         has_a_coarser_ngh = wp.bool(False)
                         exploded_pop = wp.neon_lbm_read_coarser_ngh(
-                            explosion_src_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                            explosion_src_pn, index, pull_direction, l, self.store_dtype(0), has_a_coarser_ngh
                         )
                         if has_a_coarser_ngh:
                             # No finer ngh in pull direction, no same-level ngh,
                             # but a parent (ghost cell) exists with a coarser ngh
                             # -> Explosion: read the exploded population from the
                             #    coarser level's halo.
-                            _f_post_stream[l] = exploded_pop
+                            _f_post_stream[l] = self.compute_dtype(exploded_pop)
 
             return _f_post_stream
 
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index bcb674b8..d0dcf09a 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -481,7 +481,6 @@ def _construct_neon(self):
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
         _opp_indices = self.velocity_set.opp_indices
         lattice_central_index = self.velocity_set.center_index
-        # _cast_to_store_dtype = self.store_dtype()
 
         # Read the list of bc_to_id created upon instantiation
         bc_to_id = boundary_condition_registry.bc_to_id
@@ -577,8 +576,6 @@ def container(
             omega: Any,
             timestep: int,
         ):
-            cast_to_store_dtype = self.store_dtype
-
             def nse_stepper_ll(loader: neon.Loader):
                 loader.set_grid(bc_mask_fd.get_grid())
 
@@ -620,7 +617,7 @@ def nse_stepper_cl(index: Any):
 
                     # Store the result in f_1
                     for l in range(self.velocity_set.q):
-                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                        wp.neon_write(f_1_pn, index, l, self.store_dtype(_f_post_collision[l]))
 
                 loader.declare_kernel(nse_stepper_cl)
 
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index cd1466aa..7cfe2d19 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -143,7 +143,7 @@ def functional(
                 unused_is_valid = wp.bool(False)
 
                 # Read the distribution function from the neighboring cell in the pull direction
-                _f[l] = wp.neon_read_ngh(f, index, ngh, l, self.compute_dtype(0), unused_is_valid)
+                _f[l] = self.compute_dtype(wp.neon_read_ngh(f, index, ngh, l, self.store_dtype(0), unused_is_valid))
             return _f
 
         return functional, None

From 0f01e8cae0b820ba17fa92cdfdaafa4c7dea3799 Mon Sep 17 00:00:00 2001
From: massimim <57805133+massimim@users.noreply.github.com>
Date: Fri, 13 Mar 2026 21:40:45 +0100
Subject: [PATCH 207/208] (refactoring) Allow the Warp backend to run without
 Neon being installed + README update (#39)

---
 README.md                                             | 11 +++++++++++
 examples/cfd/windtunnel_3d.py                         |  6 +++---
 xlb/grid/grid.py                                      |  5 +++--
 xlb/helper/initializers.py                            |  3 ++-
 xlb/helper/simulation_manager.py                      |  3 ++-
 xlb/operator/boundary_condition/boundary_condition.py |  1 -
 .../boundary_condition/helper_functions_bc.py         |  3 ++-
 xlb/operator/boundary_masker/aabb.py                  |  3 ++-
 .../boundary_masker/indices_boundary_masker.py        |  3 ++-
 xlb/operator/boundary_masker/multires_aabb.py         |  3 ++-
 xlb/operator/boundary_masker/multires_aabb_close.py   |  3 ++-
 .../multires_indices_boundary_masker.py               |  3 ++-
 xlb/operator/boundary_masker/multires_ray.py          |  3 ++-
 xlb/operator/boundary_masker/ray.py                   |  3 ++-
 .../equilibrium/multires_quadratic_equilibrium.py     |  3 ++-
 xlb/operator/equilibrium/quadratic_equilibrium.py     |  3 +--
 xlb/operator/force/momentum_transfer.py               |  3 ++-
 xlb/operator/force/multires_momentum_transfer.py      |  3 ++-
 xlb/operator/macroscopic/macroscopic.py               |  2 +-
 xlb/operator/macroscopic/multires_macroscopic.py      |  2 +-
 xlb/operator/operator.py                              |  4 +++-
 xlb/operator/stepper/nse_multires_stepper.py          |  3 ++-
 xlb/operator/stepper/nse_stepper.py                   |  4 +++-
 xlb/utils/mesher.py                                   |  2 +-
 24 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index f383fb20..2459ec6a 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,17 @@ This installation is for the JAX backend with TPU support:
 pip install "xlb[tpu]"
 ```
 
+### Installation with Neon support
+The Neon backend enables multi-GPU dense and single-GPU multi-resolution representations.
+At the moment, Neon depends on an ad-hoc fork of warp-lang, so any existing Warp installation must be removed before installing Neon. The Python interface for Neon is distributed as a prebuilt wheel hosted on GitHub (the wheel below targets CPython 3.12 on Linux x86_64).
+
+```bash
+pip uninstall warp-lang
+pip install https://github.com/Autodesk/Neon/releases/download/v0.5.2a1/neon_gpu-0.5.2a1-cp312-cp312-linux_x86_64.whl
+```
+
+
+
 ### Notes:
 - For Mac users: Use the basic CPU installation command as JAX's GPU support is not available on MacOS
 - The NVIDIA Warp backend is included in all installation options and supports CUDA automatically when available
diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index 2d7ecf1f..9909132c 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -21,7 +21,6 @@
 import matplotlib.pyplot as plt
 from xlb.operator.boundary_masker import MeshVoxelizationMethod
 
-import neon
 
 # -------------------------- Simulation Setup --------------------------
 
@@ -30,7 +29,7 @@
 grid_shape = (grid_size_x, grid_size_y, grid_size_z)
 
 # Simulation Configuration
-compute_backend = ComputeBackend.NEON
+compute_backend = ComputeBackend.WARP
 precision_policy = PrecisionPolicy.FP32FP32
 
 velocity_set = xlb.velocity_set.D3Q27(precision_policy=precision_policy, compute_backend=compute_backend)
@@ -111,7 +110,8 @@
 
 
 # Configure backend options:
-backend_config = {"occ": neon.SkeletonConfig.OCC.from_string("standard"), "device_list": [0, 1]} if compute_backend == ComputeBackend.NEON else {}
+# backend_config = {"occ": neon.SkeletonConfig.OCC.from_string("standard"), "device_list": [0, 1]} if compute_backend == ComputeBackend.NEON else {}
+backend_config = {}
 
 # Setup Stepper
 stepper = IncompressibleNavierStokesStepper(
diff --git a/xlb/grid/grid.py b/xlb/grid/grid.py
index 90aeb2c6..7ae9236f 100644
--- a/xlb/grid/grid.py
+++ b/xlb/grid/grid.py
@@ -14,7 +14,6 @@
 
 from xlb import DefaultConfig
 from xlb.compute_backend import ComputeBackend
-import neon
 
 
 def grid_factory(
@@ -64,8 +63,10 @@ def multires_grid_factory(
     compute_backend: ComputeBackend = None,
     velocity_set=None,
     sparsity_pattern_list: List[np.ndarray] = [],
-    sparsity_pattern_origins: List[neon.Index_3d] = [],
+    sparsity_pattern_origins=[],
 ):
+    import neon
+
     """Create a multi-resolution grid (Neon backend only).
 
     Parameters
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index 063dcafa..bda54e11 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -20,7 +20,6 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.equilibrium import QuadraticEquilibrium
 from xlb.operator.equilibrium import MultiresQuadraticEquilibrium
-import neon
 
 
 def initialize_eq(f, grid, velocity_set, precision_policy, compute_backend, rho=None, u=None):
@@ -221,6 +220,8 @@ def warp_implementation(self, bc_mask, f_field):
         return f_field
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index da7b2a32..ad60b939 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -7,7 +7,6 @@
 interleaves coarse and fine grid updates.
 """
 
-import neon
 import warp as wp
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.macroscopic import MultiresMacroscopic
@@ -171,6 +170,8 @@ def _build_recursion(self, level, app, config):
             self.add_to_app(app=app, op_name=op_name, level=level, **fields_swapped, **extra)
 
     def _construct_stepper_skeleton(self):
+        import neon
+
         """Build the Neon skeleton that encodes the recursive time-stepping order.
 
         The skeleton is a list of Neon container invocations that, when
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 4d44dcc6..4ac2a96a 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -21,7 +21,6 @@
 from xlb.operator.boundary_condition.boundary_condition_registry import boundary_condition_registry
 from xlb.operator.boundary_condition import HelperFunctionsBC
 from xlb.operator.boundary_masker.mesh_voxelization_method import MeshVoxelizationMethod
-import neon
 
 
 class ImplementationStep(Enum):
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
index 1f981219..c613573b 100644
--- a/xlb/operator/boundary_condition/helper_functions_bc.py
+++ b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -15,7 +15,6 @@
 from typing import Any, Callable
 
 import warp as wp
-import neon
 
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
@@ -500,6 +499,8 @@ def kernel(
         return functional_dict, kernel
 
     def _construct_neon(self):
+        import neon
+
         """
         Constructs the Neon container for encoding auxiliary data recovery.
         """
diff --git a/xlb/operator/boundary_masker/aabb.py b/xlb/operator/boundary_masker/aabb.py
index 98513a9b..94def41b 100644
--- a/xlb/operator/boundary_masker/aabb.py
+++ b/xlb/operator/boundary_masker/aabb.py
@@ -6,7 +6,6 @@
 """
 
 import warp as wp
-import neon
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
@@ -143,6 +142,8 @@ def warp_implementation(
         )
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index da4d5c80..3384d71c 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -12,7 +12,6 @@
 import jax.numpy as jnp
 import numpy as np
 import warp as wp
-import neon
 
 from xlb.compute_backend import ComputeBackend
 from xlb.grid import grid_factory
@@ -414,6 +413,8 @@ def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
         return bc_mask, missing_mask
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functional for the NEON backend
         functional_dict, _ = self._construct_warp()
         functional_domain_bounds = functional_dict.get("functional_domain_bounds")
diff --git a/xlb/operator/boundary_masker/multires_aabb.py b/xlb/operator/boundary_masker/multires_aabb.py
index f2aa50c5..f9cc5887 100644
--- a/xlb/operator/boundary_masker/multires_aabb.py
+++ b/xlb/operator/boundary_masker/multires_aabb.py
@@ -9,7 +9,6 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker import MeshMaskerAABB
 from xlb.operator.operator import Operator
-import neon
 
 
 class MultiresMeshMaskerAABB(MeshMaskerAABB):
@@ -87,6 +86,8 @@ def neon_implementation(
         missing_mask,
         stream=0,
     ):
+        import neon
+
         # Prepare inputs
         mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
 
diff --git a/xlb/operator/boundary_masker/multires_aabb_close.py b/xlb/operator/boundary_masker/multires_aabb_close.py
index ffd50904..df461706 100644
--- a/xlb/operator/boundary_masker/multires_aabb_close.py
+++ b/xlb/operator/boundary_masker/multires_aabb_close.py
@@ -6,7 +6,6 @@
 """
 
 import warp as wp
-import neon
 from typing import Any
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
@@ -40,6 +39,8 @@ def __init__(
         self.neon_functional_dict, self.neon_container_dict = self._construct_neon()
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functionals from the base (for reference), but implement NEON variants here
         functional_dict_warp, _ = self._construct_warp()
         functional_erode_warp = functional_dict_warp.get("functional_erode")
diff --git a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
index 6223cb1b..bf7fc1d7 100644
--- a/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/multires_indices_boundary_masker.py
@@ -16,7 +16,6 @@
 from xlb.precision_policy import PrecisionPolicy
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker import IndicesBoundaryMasker
-import neon
 
 
 class MultiresIndicesBoundaryMasker(IndicesBoundaryMasker):
@@ -139,6 +138,8 @@ def interior_missing_mask_kernel(index: Any):
 
     @Operator.register_backend(ComputeBackend.NEON)
     def neon_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
+        import neon
+
         grid = bc_mask.get_grid()
         num_levels = grid.num_levels
         grid_shape_finest = self.helper_masker.get_grid_shape(bc_mask)
diff --git a/xlb/operator/boundary_masker/multires_ray.py b/xlb/operator/boundary_masker/multires_ray.py
index 8719e33c..9a5a01d8 100644
--- a/xlb/operator/boundary_masker/multires_ray.py
+++ b/xlb/operator/boundary_masker/multires_ray.py
@@ -9,7 +9,6 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker import MeshMaskerRay
 from xlb.operator.operator import Operator
-import neon
 
 
 class MultiresMeshMaskerRay(MeshMaskerRay):
@@ -78,6 +77,8 @@ def neon_implementation(
         missing_mask,
         stream=0,
     ):
+        import neon
+
         # Prepare inputs
         mesh_id, bc_id = self._prepare_kernel_inputs(bc, bc_mask)
 
diff --git a/xlb/operator/boundary_masker/ray.py b/xlb/operator/boundary_masker/ray.py
index 5f44a2c2..f5a2d96b 100644
--- a/xlb/operator/boundary_masker/ray.py
+++ b/xlb/operator/boundary_masker/ray.py
@@ -12,7 +12,6 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.boundary_masker.mesh_boundary_masker import MeshBoundaryMasker
 from xlb.operator.operator import Operator
-import neon
 
 
 class MeshMaskerRay(MeshBoundaryMasker):
@@ -120,6 +119,8 @@ def warp_implementation(
         )
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
diff --git a/xlb/operator/equilibrium/multires_quadratic_equilibrium.py b/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
index cc289e41..0a3bb959 100644
--- a/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/multires_quadratic_equilibrium.py
@@ -3,7 +3,6 @@
 """
 
 import warp as wp
-import neon
 from typing import Any
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.equilibrium import QuadraticEquilibrium
@@ -24,6 +23,8 @@ def __init__(self, *args, **kwargs):
             raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index 890df126..5ec14618 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -4,7 +4,6 @@
 import warp as wp
 import os
 
-import neon
 from typing import Any
 
 from xlb.compute_backend import ComputeBackend
@@ -104,7 +103,7 @@ def warp_implementation(self, rho, u, f):
         return f
 
     def _construct_neon(self):
-        import neon, typing
+        import neon
 
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index c24ab349..05952d7d 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -10,7 +10,6 @@
 from xlb.compute_backend import ComputeBackend
 from xlb.operator.operator import Operator
 from xlb.operator.stream import Stream
-import neon
 
 
 # Enum used to keep track of LBM operations
@@ -304,6 +303,8 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask):
         return self.force.numpy()[0]
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
diff --git a/xlb/operator/force/multires_momentum_transfer.py b/xlb/operator/force/multires_momentum_transfer.py
index 7622e467..f7e04a43 100644
--- a/xlb/operator/force/multires_momentum_transfer.py
+++ b/xlb/operator/force/multires_momentum_transfer.py
@@ -5,7 +5,6 @@
 from typing import Any
 
 import warp as wp
-import neon
 
 from xlb.velocity_set.velocity_set import VelocitySet
 from xlb.precision_policy import PrecisionPolicy
@@ -76,6 +75,8 @@ def __init__(
         super().__init__(no_slip_bc_instance, operation_sequence, velocity_set, precision_policy, compute_backend)
 
     def _construct_neon(self):
+        import neon
+
         # Use the warp functional for the NEON backend
         functional, _ = self._construct_warp()
 
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 63d25ff4..971081c9 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -64,7 +64,7 @@ def warp_implementation(self, f, rho, u):
         return rho, u
 
     def _construct_neon(self):
-        import neon, typing
+        import neon
 
         # Redefine the zero and first moment operators for the neon backend
         # This is because the neon backend relies on the warp functionals for its operations.
diff --git a/xlb/operator/macroscopic/multires_macroscopic.py b/xlb/operator/macroscopic/multires_macroscopic.py
index b754fb43..b7df10c7 100644
--- a/xlb/operator/macroscopic/multires_macroscopic.py
+++ b/xlb/operator/macroscopic/multires_macroscopic.py
@@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs):
             raise NotImplementedError(f"Operator {self.__class__.__name__} not supported in {self.compute_backend} backend.")
 
     def _construct_neon(self):
-        import neon, typing
+        import neon
 
         # Redefine the zero and first moment operators for the neon backend
         # This is because the neon backend relies on the warp functionals for its operations.
diff --git a/xlb/operator/operator.py b/xlb/operator/operator.py
index 2511b159..4405708c 100644
--- a/xlb/operator/operator.py
+++ b/xlb/operator/operator.py
@@ -250,6 +250,7 @@ def write_field(
                 field[direction, index[0], index[1], index[2]] = value
 
         elif self.compute_backend == ComputeBackend.NEON:
+            import neon
 
             @wp.func
             def read_field(
@@ -279,7 +280,6 @@ def _construct_read_field_neighbor(self):
         """
         Construct a function to read a field value at a neighboring index along a given direction.
         """
-        from neon.multires.mPartition import neon_get_type
 
         if self.compute_backend == ComputeBackend.WARP:
 
@@ -295,6 +295,8 @@ def read_field_neighbor(
                 return field[direction, neighbor[0], neighbor[1], neighbor[2]]
 
         elif self.compute_backend == ComputeBackend.NEON:
+            import neon
+            # from neon.multires.mPartition import neon_get_type
 
             @wp.func
             def read_field_neighbor(
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 059f4f41..629f1b07 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -68,7 +68,6 @@ def my_kernel(...):
 
 import nvtx
 import warp as wp
-import neon
 from typing import Any
 
 from xlb import DefaultConfig
@@ -157,6 +156,8 @@ def __init__(
         self.macroscopic = MultiresMacroscopic(self.velocity_set, self.precision_policy, self.compute_backend)
 
     def prepare_fields(self, rho, u, initializer=None):
+        import neon
+
         """Prepare the fields required for the stepper.
 
         Args:
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index d0dcf09a..751e58dc 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -10,7 +10,6 @@
 
 from jax import jit
 import warp as wp
-import neon
 from typing import Any
 
 from xlb import DefaultConfig
@@ -476,6 +475,8 @@ def warp_implementation(self, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         return f_0, f_1
 
     def _construct_neon(self):
+        import neon
+
         # Set local constants
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
         _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)
@@ -546,6 +547,7 @@ def neon_apply_aux_recovery_bc(
             f_0_pn: Any,
             f_1_pn: Any,
         ):
+
             # Note:
             # In XLB, the BC auxiliary data (e.g. prescribed values of pressure or normal velocity) are stored in (i) central index of f_1 and/or
             # (ii) missing directions of f_1. Some BCs may or may not need all these available storage space. This function checks whether
diff --git a/xlb/utils/mesher.py b/xlb/utils/mesher.py
index 74e4796f..8b849b87 100644
--- a/xlb/utils/mesher.py
+++ b/xlb/utils/mesher.py
@@ -15,7 +15,6 @@
 import trimesh
 from typing import Any, Optional
 
-import neon
 import warp as wp
 from xlb.utils.utils import UnitConvertor
 
@@ -483,6 +482,7 @@ def _construct_neon_container(self):
         Constructs a NEON container for exporting multi-resolution data to HDF5.
         This container will be used to transfer multi-resolution NEON fields into stacked warp fields.
         """
+        import neon
 
         @neon.Container.factory(name="HDF5MultiresExporter")
         def container(

From 3413ac3a3618b158976657019545096925780e9a Mon Sep 17 00:00:00 2001
From: Hesam Salehipour <hesam.salehipour@autodesk.com>
Date: Fri, 13 Mar 2026 17:31:38 -0400
Subject: [PATCH 208/208] Ensuring all pytests are passing.

---
 README.md                                                | 3 +--
 .../bc_equilibrium/test_bc_equilibrium_warp.py           | 2 +-
 .../test_bc_fullway_bounce_back_warp.py                  | 2 +-
 .../mask/test_bc_indices_masker_warp.py                  | 4 ++--
 xlb/operator/boundary_masker/indices_boundary_masker.py  | 9 ++++++---
 xlb/operator/stepper/nse_stepper.py                      | 1 -
 6 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 2459ec6a..65f312cc 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,7 @@ pip install "xlb[tpu]"
 ```
 
 ### Installation with Neon support 
-Neon backend enables multi-GPU dense and single-GPU multi-resolution representations.
-At the moment Neon depends on an ad-hoc fork of warp-lang, therefore we need to first remove any current warp installation before installing Neon. The Python interface for Neon can be fetched from a wheel stored on GitHub. 
+The Neon backend enables multi-GPU dense and single-GPU multi-resolution representations. Neon depends on a custom fork of warp-lang, so any existing warp installation must be removed before installing Neon. The Python interface for Neon can be installed from a pre-built wheel hosted on GitHub. Note that the wheel currently requires GLIBC >= 2.38 (e.g., Ubuntu 24.04 or later).
 
 ```bash
 pip uninstall warp-lang
diff --git a/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py b/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
index 07e68cf4..711c34bf 100644
--- a/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
+++ b/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
@@ -30,7 +30,7 @@ def test_bc_equilibrium_warp(dim, velocity_set, grid_shape):
     my_grid = grid_factory(grid_shape)
     velocity_set = DefaultConfig.velocity_set
 
-    missing_mask = my_grid.create_field(cardinality=velocity_set.q, dtype=xlb.Precision.BOOL)
+    missing_mask = my_grid.create_field(cardinality=velocity_set.q, dtype=xlb.Precision.UINT8)
 
     bc_mask = my_grid.create_field(cardinality=1, dtype=xlb.Precision.UINT8)
 
diff --git a/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py b/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
index 3cc15cb3..4f5d6757 100644
--- a/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
+++ b/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
@@ -33,7 +33,7 @@ def test_fullway_bounce_back_warp(dim, velocity_set, grid_shape):
     my_grid = grid_factory(grid_shape)
     velocity_set = DefaultConfig.velocity_set
 
-    missing_mask = my_grid.create_field(cardinality=velocity_set.q, dtype=xlb.Precision.BOOL)
+    missing_mask = my_grid.create_field(cardinality=velocity_set.q, dtype=xlb.Precision.UINT8)
 
     bc_mask = my_grid.create_field(cardinality=1, dtype=xlb.Precision.UINT8)
 
diff --git a/tests/boundary_conditions/mask/test_bc_indices_masker_warp.py b/tests/boundary_conditions/mask/test_bc_indices_masker_warp.py
index 4ec0639e..cb012cee 100644
--- a/tests/boundary_conditions/mask/test_bc_indices_masker_warp.py
+++ b/tests/boundary_conditions/mask/test_bc_indices_masker_warp.py
@@ -32,7 +32,7 @@ def test_indices_masker_warp(dim, velocity_set, grid_shape):
     my_grid = grid_factory(grid_shape)
     velocity_set = DefaultConfig.velocity_set
 
-    missing_mask = my_grid.create_field(cardinality=velocity_set.q, dtype=xlb.Precision.BOOL)
+    missing_mask = my_grid.create_field(cardinality=velocity_set.q, dtype=xlb.Precision.UINT8)
 
     bc_mask = my_grid.create_field(cardinality=1, dtype=xlb.Precision.UINT8)
 
@@ -61,7 +61,7 @@ def test_indices_masker_warp(dim, velocity_set, grid_shape):
         bc_mask,
         missing_mask,
     )
-    assert missing_mask.dtype == xlb.Precision.BOOL.wp_dtype
+    assert missing_mask.dtype == xlb.Precision.UINT8.wp_dtype
 
     assert bc_mask.dtype == xlb.Precision.UINT8.wp_dtype
 
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 3384d71c..e888c284 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -115,9 +115,12 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
 
                 # The missing mask is set to True meaning (exterior or solid nodes) using the original indices.
                 # This is because of the following streaming step which will assign missing directions for the boundary nodes.
-                missing_mask_extended = missing_mask_extended.at[:, solid_indices_shifted[0], solid_indices_shifted[1], solid_indices_shifted[2]].set(
-                    True
-                )
+                if dim == 2:
+                    missing_mask_extended = missing_mask_extended.at[:, solid_indices_shifted[0], solid_indices_shifted[1]].set(True)
+                else:
+                    missing_mask_extended = missing_mask_extended.at[
+                        :, solid_indices_shifted[0], solid_indices_shifted[1], solid_indices_shifted[2]
+                    ].set(True)
             else:
                 indices_shifted = bc_indices - indices_origin + shift
 
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 751e58dc..475958ac 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -547,7 +547,6 @@ def neon_apply_aux_recovery_bc(
             f_0_pn: Any,
             f_1_pn: Any,
         ):
-
             # Note:
             # In XLB, the BC auxiliary data (e.g. prescribed values of pressure or normal velocity) are stored in (i) central index of f_1 and/or
             # (ii) missing directions of f_1. Some BCs may or may not need all these available storage space. This function checks whether