diff --git a/Accelerated_Python_User_Guide/notebooks/2D_Navier_Stokes_In_Warp.ipynb b/Accelerated_Python_User_Guide/notebooks/2D_Navier_Stokes_In_Warp.ipynb
new file mode 100644
index 00000000..a1efbdea
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/2D_Navier_Stokes_In_Warp.ipynb
@@ -0,0 +1,1332 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# SPDX-License-Identifier: Apache-2.0 AND CC-BY-NC-4.0\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "# http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to use NVIDIA Warp to Build GPU-Accelerated Computational Physics Simulations\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "In this notebook, we will build a fluid solver entirely in Python using the Warp framework. Specifically, we will simulate 2D turbulent flow in a box.\n",
+    "\n",
+    "Through this implementation, you will learn:\n",
+    "\n",
+    "* Writing SIMT (single instruction, multiple threads) kernels for building blocks of a solver\n",
+    "* Using Warp's tile-based programming primitives to accelerate matrix operations like FFT, inverse FFT\n",
+    "* CUDA graph capture to reduce launch overheads and make timestepping more efficient\n",
+    "\n",
+    "By the end of this lab, we will have built a GPU-accelerated 2D Navier-Stokes solver while gaining hands-on experience applying Warp's features to computational physics problems."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Setup\n",
+    "\n",
+    "Before we start the lab, let's ensure we have all the necessary packages installed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install Warp\n",
+    "%pip install warp-lang\n",
+    "\n",
+    "# Install visualization package\n",
+    "%pip install matplotlib Pillow"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's import the necessary libraries and initialize Warp to check if GPU support is available:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import warp as wp\n",
+    "\n",
+    "if wp.get_cuda_device_count() > 0:\n",
+    "    print(\"GPU detected successfully\")\n",
+    "else:\n",
+    "    print(\"No GPU detected!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## What is a solver?\n",
+    "\n",
+    "<div align=\"center\">\n",
+    "<svg width=\"800\" height=\"400\" xmlns=\"http://www.w3.org/2000/svg\">\n",
+    "  <defs>\n",
+    "    <style>\n",
+    "      .input-box { fill: #E8F4F8; stroke: #2C3E50; stroke-width: 2; }\n",
+    "      .solver-box { fill: #FFE5B4; stroke: #2C3E50; stroke-width: 3; }\n",
+    "      .output-box { fill: #E8F5E9; stroke: #2C3E50; stroke-width: 2; }\n",
+    "      .text-main { font-family: Arial, sans-serif; font-size: 16px; font-weight: bold; fill: #2C3E50; text-anchor: middle; }\n",
+    "      .text-large { font-family: Arial, sans-serif; font-size: 22px; font-weight: bold; fill: #2C3E50; text-anchor: middle; }\n",
+    "      .text-label { font-family: Arial, sans-serif; font-size: 18px; font-weight: bold; fill: #2C3E50; text-anchor: middle; font-style: italic; }\n",
+    "      .arrow { stroke: #34495E; stroke-width: 3; fill: none; marker-end: url(#arrowhead); }\n",
+    "    </style>\n",
+    "    <marker id=\"arrowhead\" markerWidth=\"10\" markerHeight=\"10\" refX=\"9\" refY=\"3\" orient=\"auto\">\n",
+    "      <polygon points=\"0 0, 10 3, 0 6\" fill=\"#34495E\" />\n",
+    "    </marker>\n",
+    "  </defs>\n",
+    "  <rect width=\"800\" height=\"400\" fill=\"white\"/>\n",
+    "  <text x=\"120\" y=\"30\" class=\"text-label\">INPUTS</text>\n",
+    "  <rect x=\"30\" y=\"60\" width=\"180\" height=\"70\" rx=\"10\" class=\"input-box\"/>\n",
+    "  <text x=\"120\" y=\"90\" class=\"text-main\">Governing</text>\n",
+    "  <text x=\"120\" y=\"110\" class=\"text-main\">Equations</text>\n",
+    "  <rect x=\"30\" y=\"150\" width=\"180\" height=\"70\" rx=\"10\" class=\"input-box\"/>\n",
+    "  <text x=\"120\" y=\"180\" class=\"text-main\">Initial</text>\n",
+    "  <text x=\"120\" y=\"200\" class=\"text-main\">Conditions</text>\n",
+    "  <rect x=\"30\" y=\"240\" width=\"180\" height=\"70\" rx=\"10\" class=\"input-box\"/>\n",
+    "  <text x=\"120\" y=\"270\" class=\"text-main\">Boundary</text>\n",
+    "  <text x=\"120\" y=\"290\" class=\"text-main\">Conditions</text>\n",
+    "  <path d=\"M 210 95 L 300 185\" class=\"arrow\"/>\n",
+    "  <path d=\"M 210 185 L 300 185\" class=\"arrow\"/>\n",
+    "  <path d=\"M 210 275 L 300 185\" class=\"arrow\"/>\n",
+    "  <rect x=\"300\" y=\"130\" width=\"200\" height=\"110\" rx=\"15\" class=\"solver-box\"/>\n",
+    "  <text x=\"400\" y=\"195\" class=\"text-large\">SOLVER</text>\n",
+    "  <path d=\"M 500 185 L 570 185\" class=\"arrow\"/>\n",
+    "  <text x=\"660\" y=\"30\" class=\"text-label\">OUTPUT</text>\n",
+    "  <rect x=\"570\" y=\"150\" width=\"180\" height=\"70\" rx=\"10\" class=\"output-box\"/>\n",
+    "  <text x=\"660\" y=\"180\" class=\"text-main\">Fields of</text>\n",
+    "  <text x=\"660\" y=\"200\" class=\"text-main\">Interest</text>\n",
+    "</svg>\n",
+    "</div>\n",
+    " \n",
+    "At its core, a **solver** is a program that computes how something evolves over time, space, or both by discretizing governing equations on a grid. It takes three inputs and gives you the physical fields as a function of space and time.\n",
+    "\n",
+    "1. **Governing equations** \u2013 the mathematical description of the physics (e.g., Newton's laws, heat equation, fluid dynamics)\n",
+    "2. **Initial conditions** \u2013 the starting state of the system (e.g., initial temperature distribution, initial velocity field)\n",
+    "3. **Boundary conditions** \u2013 what happens at the edges of the domain (e.g., walls, inlets, periodic boundaries)\n",
+    "\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/structured_vs_unstructured_grid.png\" alt=\"Structured vs Unstructured Grid\" width=\"50%\"><br>\n",
+    "  <em>Structured vs. unstructured grids</em>\n",
+    "</p>\n",
+    "\n",
+    "\n",
+    "**Computational Fluid Dynamics (CFD)** is a sub-field of computational physics where solvers almost always simulate the Navier-Stokes equations, at times in conjunction with other physics like heat transfer or plasma dynamics. These simulations are notoriously compute- and memory-intensive. A typical industry-grade CFD simulation involves:\n",
+    " \n",
+    "- Millions (sometimes billions) of grid points, each storing velocity, pressure, temperature, etc.\n",
+    "- Thousands of timesteps, each requiring calculations at every grid point\n",
+    "- Stencil operations where each point depends on its neighbors\n",
+    " \n",
+    "Here's the key insight --  **updating each grid point is largely independent of the others within a single timestep**. This is a perfect match for GPUs, which excel at running thousands of threads in parallel.\n",
+    "\n",
+    "### Where Does Warp Come Into the Picture? \n",
+    "\n",
+    "Warp bridges the gap between Python's ease of use and GPU performance. Instead of writing low-level CUDA, you write Python code decorated with `@wp.kernel`, and Warp compiles it to efficient GPU code. This lets us prototype and iterate quickly while still getting GPU acceleration where it matters."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## What are we building today?\n",
+    "\n",
+    "![Turbulence simulation](images/chapter-12.2/turbulence_256x256_Re1000.gif)\n",
+    "\n",
+    "In today's lab, we will **not** build an industry-grade solver! Instead, we'll simulate a classic academic problem of simulating **decaying turbulence** governed by the 2-D Navier-Stokes equations, formulated **in terms of vorticity and stream function**, on a **periodic domain**.\n",
+    "\n",
+    "- **Vorticity** $\\omega$: Measures local rotation of the fluid.\n",
+    "- **Stream function** $\\psi$: Defines velocity through its derivatives. This formulation ensures that the continuity equation is automatically satisfied.\n",
+    "\n",
+    "By using $\\psi$ and $\\omega$, we reduce the problem to solving the **vorticity transport equation** with only **two unknowns $\\omega$ and $\\psi$**\n",
+    "\n",
+    "$$\\underbrace{\\frac{\\partial \\omega}{\\partial t}}_{\\text{time evolution}} + \\underbrace{\\frac{\\partial \\psi}{\\partial y}\\frac{\\partial \\omega}{\\partial x} - \\frac{\\partial \\psi}{\\partial x}\\frac{\\partial \\omega}{\\partial y}}_{\\text{advection}} = \\underbrace{\\frac{1}{\\text{Re}}\\nabla^2 \\omega}_{\\text{diffusion}} \\tag{1}.$$\n",
+    "\n",
+    "The $\\omega$ and $\\psi$ satisfy the Poisson equation \n",
+    "$$\\nabla^2 \\psi = -\\omega \\tag{2}.$$\n",
+    "\n",
+    "<p align=\"center\">\n",
+    "<img src=\"images/chapter-12.2/vorticity_grid_overlay.png\" alt=\"Vorticity field with grid overlay\" width=\"50%\"> \n",
+    "</p>\n",
+    "\n",
+    "We solve these equations on a structured $N \\times N$ grid with periodic boundary conditions. The red dots indicate where we store the values of $\\omega$ and $\\psi$.\n",
+    "\n",
+    "- We solve **Eq 1** using finite differences (**SIMT kernels**)\n",
+    "- We solve **Eq 2** using FFT in Fourier space (**Tile kernels**)\n",
+    "\n",
+    "The solver loop then takes the form shown below. What remains is to build the inner workings of our solver.\n",
+    "\n",
+    "<p align=\"center\">\n",
+    "   <img src=\"images/chapter-12.2/navier_stokes_timestep.svg\" alt=\"Solver timestep loop\" width=\"600\">\n",
+    "</p>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Building blocks of the solver\n",
+    "\n",
+    "Let's zoom in now on the inner workings of our solver. Our simple solver would have two main building blocks.\n",
+    "\n",
+    "1. **Discretization + Time Marching**: Discretize the advection and diffusion terms to obtain the RHS $\\mathcal{L}(\\omega)$ of $\\frac{\\partial \\omega}{\\partial t} = \\mathcal{L}(\\omega)$ (equation 1), then advance $\\omega(t)$ to $\\omega(t+\\Delta t)$ using simple approximation of \n",
+    "\n",
+    "$$\\frac{\\partial \\omega}{\\partial t} \\approx \\frac{\\omega^{(n+1)} - \\omega^{n}}{\\Delta t}.$$\n",
+    "\n",
+    "2. **Poisson Solve**: Solve Poisson equation (equation 2) for $\\psi$ given $\\omega$ - **details when we get there**.\n",
+    "\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/solver_pipeline_simple.svg\" alt=\"Solver Pipeline: One Timestep\" width=\"800\">\n",
+    "</p>\n",
+    "\n",
+    "In pseudo-code, one timestep of our solver looks like:\n",
+    "\n",
+    "    def step(omega, psi):\n",
+    "        # === Block 1: Discretization + Time-stepping (SIMT) ===\n",
+    "        rhs = (1/Re) * diffusion(omega) - advection(omega, psi)\n",
+    "        omega_new = omega + dt * rhs\n",
+    "\n",
+    "        # === Block 2: Poisson Solve (Tile) ===\n",
+    "        psi = poisson_solve(omega_new)    # details in Block 2\n",
+    "\n",
+    "Here are the **6 kernels/functions** that you will implement in this lab:\n",
+    "\n",
+    "**Building Block 1: Discretization + Time-stepping (SIMT)**\n",
+    "\n",
+    "| Exercise                   | Type         | Warp Concept                 |\n",
+    "| -------------------------- | ------------ | ---------------------------- |\n",
+    "| `advection()`              | `@wp.func`   | Stencil access pattern       |\n",
+    "| `diffusion()`              | `@wp.func`   | Stencil access pattern       |\n",
+    "| `viscous_advection_kernel` | `@wp.kernel` | SIMT kernel, combining funcs |\n",
+    "\n",
+    "**Building Block 2: FFT Poisson Solver (Tile)**\n",
+    "\n",
+    "| Exercise     | Type         | Warp Concept                                   |\n",
+    "| ------------ | ------------ | ---------------------------------------------- |\n",
+    "| `fft_tiled`  | `@wp.kernel` | `wp.tile_load`, `wp.tile_fft`, `wp.tile_store` |\n",
+    "| `ifft_tiled` | `@wp.kernel` | `wp.tile_ifft`                                 |\n",
+    "| `transpose`  | `@wp.kernel` | SIMT kernel                                    |\n",
+    "\n",
+    "*Helper kernels (pre-provided, not exercises): `copy_float_to_vec2`, `multiply_k2_inverse`, `extract_real_and_scale`*\n",
+    "\n",
+    "In what follows, we will tackle Block 1 first, then Block 2. In the end, we will put them together to complete the full solver loop."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Simulation parameters\n",
+    "\n",
+    "First, let's define the global parameters for our simulation. We will solve the vorticity-transport equation on a square domain of size $2\\pi$, discretized on a $1024 \\times 1024$ grid. We take a timestep of $\\Delta t = 5 \\times 10^{-4}$."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Grid resolution\n",
+    "N_GRID = 1024\n",
+    "\n",
+    "# Box size\n",
+    "LEN = 2 * np.pi\n",
+    "\n",
+    "# Delta t for timestepping\n",
+    "DT = 0.0005\n",
+    "\n",
+    "# Reynolds number\n",
+    "RE = 1000.0\n",
+    "\n",
+    "# h = l/N (size of a computational cell)\n",
+    "H = LEN / N_GRID"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get device object\n",
+    "device = wp.get_device(\"cuda:1\")\n",
+    "\n",
+    "# Free memory (in bytes)\n",
+    "free_mem_mb = device.free_memory / (1024 * 1024)\n",
+    "print(f\"Free GPU memory: {free_mem_mb:.2f} MB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's also allocate some helper Warp arrays that we will make use of throughout the notebook. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# allocate warp arrays for vorticity, stream-function, and RHS of NS equation\n",
+    "\n",
+    "omega_0 = wp.zeros((N_GRID, N_GRID), dtype=wp.float32)\n",
+    "omega_1 = wp.zeros((N_GRID, N_GRID), dtype=wp.float32)\n",
+    "psi = wp.zeros((N_GRID, N_GRID), dtype=wp.float32)\n",
+    "rhs = wp.zeros((N_GRID, N_GRID), dtype=wp.float32)\n",
+    "\n",
+    "# precompute 1/(kx^2 + ky^2) for spectral Poisson solver (avoid division by zero at k=0)\n",
+    "k = np.fft.fftfreq(N_GRID, d=1.0 / N_GRID)\n",
+    "kx, ky = np.meshgrid(k, k)\n",
+    "k2 = kx**2 + ky**2\n",
+    "k2i_np = np.zeros_like(k2)\n",
+    "nonzero = k2 != 0\n",
+    "k2i_np[nonzero] = 1.0 / k2[nonzero]\n",
+    "\n",
+    "k2i = wp.array2d(k2i_np.astype(np.float32), dtype=wp.float32)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Exercise: Run the cell below and see how the memory footprint of our GPU has changed after the initial allocatons."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get device object\n",
+    "device = wp.get_device(\"cuda:0\")\n",
+    "\n",
+    "# Free memory\n",
+    "free_mem_mb = device.free_memory / (1024 * 1024)\n",
+    "print(f\"Free GPU memory: {free_mem_mb:.2f} MB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's also initialize $\\omega$ on the grid."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize the initial vorticity field for decaying turbulence\n",
+    "from utils import initialize_decaying_turbulence, plot_vorticity_field\n",
+    "\n",
+    "# Initialize vorticity field for decaying turbulence\n",
+    "from utils import initialize_decaying_turbulence\n",
+    "\n",
+    "omega_init = initialize_decaying_turbulence(n_grid=N_GRID, seed=42)\n",
+    "\n",
+    "# Copy to Warp arrays\n",
+    "omega_0 = wp.array(omega_init, dtype=wp.float32)\n",
+    "omega_1 = wp.array(omega_init, dtype=wp.float32)\n",
+    "\n",
+    "# Plot the initial vorticity field\n",
+    "fig, ax = plot_vorticity_field(\n",
+    "    omega_init, title=\"Initial Vorticity Field (Decaying Turbulence)\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Building block 1: Discretization + Time Marching\n",
+    "\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/solver_pipeline_simple_discretization.svg\" alt=\"Solver pipeline highlighting discretization step\" width=\"800\">\n",
+    "</p>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "### Building block 1.1: Compute RHS\n",
+    "Recall the vorticity transport equation (1) from the Background section:\n",
+    "\n",
+    "$$\\frac{\\partial \\omega}{\\partial t} + \\underbrace{\\frac{\\partial \\psi}{\\partial y}\\frac{\\partial \\omega}{\\partial x} - \\frac{\\partial \\psi}{\\partial x}\\frac{\\partial \\omega}{\\partial y}}_{\\text{advection}} = \\underbrace{\\frac{1}{\\text{Re}}\\left(\\frac{\\partial^2 \\omega}{\\partial x^2} + \\frac{\\partial^2 \\omega}{\\partial y^2}\\right)}_{\\text{diffusion}}. \\tag{1}$$\n",
+    "\n",
+    "\n",
+    "We discretize the **advection** and **diffusion** terms on a **uniform grid** using **central finite difference schemes**. Discretization means approximating the derivatives using field values at the grid cells.\n",
+    "\n",
+    "#### Advection term\n",
+    "\n",
+    "Using central differences for first derivatives with grid spacing $h$\n",
+    "\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/omega_differential.png\" alt=\"Omega stencil\" width=\"400\">\n",
+    "  <img src=\"images/chapter-12.2/psi_differential.png\" alt=\"Psi stencil\" width=\"400\">\n",
+    "</p>\n",
+    "\n",
+    "$$\\text{Advection}_{i,j} = \\frac{\\partial \\omega}{\\partial x} \\cdot \\frac{\\partial \\psi}{\\partial y} - \\frac{\\partial \\omega}{\\partial y} \\cdot \\frac{\\partial \\psi}{\\partial x}.$$\n",
+    "\n",
+    "#### Diffusion (or Laplacian) term\n",
+    "\n",
+    "$$\\text{Diffusion}_{i,j} ={\\frac{\\omega_{i+1,j} + \\omega_{i-1,j} + \\omega_{i,j+1} + \\omega_{i,j-1} - 4\\omega_{i,j}}{h^2}}.$$\n",
+    "\n",
+    "#### What do we need to implement?\n",
+    "\n",
+    "Two Warp functions:\n",
+    "1. `advection(...)` - Warp function that computes the advection term at a grid point given the flow information on the left, right, top, and bottom neighbors.\n",
+    "2. `diffusion(...)` - Warp function that computes the Laplacian at a grid point given the flow information on the left, right, top, and bottom neighbors."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### EXERCISE: Fill in the MISSING parts in `advection(...)` and `diffusion(...)` using the finite difference formulas described above.\n",
+    "\n",
+    "NOTE: Please ignore the inner workings of `def compute_advection_diffusion_kernel(...)` for now. For now, treat it as the black box to validate the accuracy of the Warp functions that will write."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@wp.func\n",
+    "def cyclic_index(idx: wp.int32, n: wp.int32) -> wp.int32:\n",
+    "    \"\"\"Map any index to [0, n-1] for periodic boundary conditions.\n",
+    "\n",
+    "    Args:\n",
+    "        idx: Input index that may be outside the valid range.\n",
+    "        n: Grid size defining the periodic domain.\n",
+    "\n",
+    "    Returns:\n",
+    "        Index wrapped to the range [0, n-1].\n",
+    "    \"\"\"\n",
+    "    ret_idx = idx % n\n",
+    "    if ret_idx < 0:\n",
+    "        ret_idx += n\n",
+    "    return ret_idx\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def advection(\n",
+    "    omega_left: wp.float32,\n",
+    "    omega_right: wp.float32,\n",
+    "    omega_top: wp.float32,\n",
+    "    omega_down: wp.float32,\n",
+    "    psi_left: wp.float32,\n",
+    "    psi_right: wp.float32,\n",
+    "    psi_top: wp.float32,\n",
+    "    psi_down: wp.float32,\n",
+    "    h: wp.float32,\n",
+    ") -> wp.float32:\n",
+    "    \"\"\"Calculate the advection term using central finite difference.\n",
+    "\n",
+    "    Args:\n",
+    "        omega_left: Vorticity at (i-1, j).\n",
+    "        omega_right: Vorticity at (i+1, j).\n",
+    "        omega_top: Vorticity at (i, j+1).\n",
+    "        omega_down: Vorticity at (i, j-1).\n",
+    "        psi_left: Stream function at (i-1, j).\n",
+    "        psi_right: Stream function at (i+1, j).\n",
+    "        psi_top: Stream function at (i, j+1).\n",
+    "        psi_down: Stream function at (i, j-1).\n",
+    "        h: Grid spacing.\n",
+    "\n",
+    "    Returns:\n",
+    "        Advection term value at grid point (i, j).\n",
+    "    \"\"\"\n",
+    "    inv_2h = 1.0 / (2.0 * h)\n",
+    "    term_1 = ((omega_right - omega_left) * inv_2h) * (\n",
+    "        (psi_top - psi_down) * inv_2h\n",
+    "    )  # MISSING\n",
+    "    term_2 = ((omega_top - omega_down) * inv_2h) * (\n",
+    "        (psi_right - psi_left) * inv_2h\n",
+    "    )  # MISSING\n",
+    "    return term_1 - term_2\n",
+    "\n",
+    "\n",
+    "@wp.func\n",
+    "def diffusion(\n",
+    "    omega_left: wp.float32,\n",
+    "    omega_right: wp.float32,\n",
+    "    omega_center: wp.float32,\n",
+    "    omega_down: wp.float32,\n",
+    "    omega_top: wp.float32,\n",
+    "    h: wp.float32,\n",
+    ") -> wp.float32:\n",
+    "    \"\"\"Calculate the Laplacian for viscous diffusion using central difference.\n",
+    "\n",
+    "    Args:\n",
+    "        omega_left: Vorticity at (i-1, j).\n",
+    "        omega_right: Vorticity at (i+1, j).\n",
+    "        omega_center: Vorticity at (i, j).\n",
+    "        omega_down: Vorticity at (i, j-1).\n",
+    "        omega_top: Vorticity at (i, j+1).\n",
+    "        h: Grid spacing.\n",
+    "\n",
+    "    Returns:\n",
+    "        Laplacian of vorticity at grid point (i, j).\n",
+    "    \"\"\"\n",
+    "    inv_h2 = 1.0 / (h * h)\n",
+    "    # Combine both the diffusion terms in the x and y direction together\n",
+    "    laplacian = (\n",
+    "        omega_left + omega_right + omega_top + omega_down - 4.0 * omega_center\n",
+    "    ) * inv_h2  # MISSING\n",
+    "    return laplacian\n",
+    "\n",
+    "\n",
+    "# --- Validation kernel for advection and diffusion ---\n",
+    "@wp.kernel\n",
+    "def compute_advection_diffusion_kernel(\n",
+    "    omega: wp.array2d(dtype=wp.float32),\n",
+    "    psi: wp.array2d(dtype=wp.float32),\n",
+    "    advection_out: wp.array2d(dtype=wp.float32),\n",
+    "    diffusion_out: wp.array2d(dtype=wp.float32),\n",
+    "    h: wp.float32,\n",
+    "    n: wp.int32,\n",
+    "):\n",
+    "    \"\"\"Compute advection and diffusion terms at each grid point.\"\"\"\n",
+    "    i, j = wp.tid()\n",
+    "\n",
+    "    # obtain the neighboring indices for the [i, j]th cell in a periodic square box\n",
+    "    i_left = cyclic_index(i - 1, n)\n",
+    "    i_right = cyclic_index(i + 1, n)\n",
+    "    j_down = cyclic_index(j - 1, n)\n",
+    "    j_top = cyclic_index(j + 1, n)\n",
+    "\n",
+    "    # gather omega values at neighbors\n",
+    "    omega_center = omega[i, j]\n",
+    "    omega_left = omega[i_left, j]\n",
+    "    omega_right = omega[i_right, j]\n",
+    "    omega_down = omega[i, j_down]\n",
+    "    omega_top = omega[i, j_top]\n",
+    "\n",
+    "    # gather psi values at neighbors\n",
+    "    psi_left = psi[i_left, j]\n",
+    "    psi_right = psi[i_right, j]\n",
+    "    psi_down = psi[i, j_down]\n",
+    "    psi_top = psi[i, j_top]\n",
+    "\n",
+    "    # compute diffusion term (Laplacian)\n",
+    "    diffusion_out[i, j] = diffusion(\n",
+    "        omega_left,\n",
+    "        omega_right,\n",
+    "        omega_center,\n",
+    "        omega_down,\n",
+    "        omega_top,\n",
+    "        h,\n",
+    "    )\n",
+    "\n",
+    "    # compute advection term J(psi, omega)\n",
+    "    advection_out[i, j] = advection(\n",
+    "        omega_left,\n",
+    "        omega_right,\n",
+    "        omega_top,\n",
+    "        omega_down,\n",
+    "        psi_left,\n",
+    "        psi_right,\n",
+    "        psi_top,\n",
+    "        psi_down,\n",
+    "        h,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### EXERCISE: Run the code below, analyze the results. You can also try some other combinations of `n_grid`, `kx`, `ky`.\n",
+    "\n",
+    "Here we are testing the accuracy of our `advection(...)` and `diffusion(...)` Warp functions using 2-D functions with known analytical solutions. This allows us to compare our numerical approximations against the exact results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run validation with manufactured solution\n",
+    "from utils import validate_advection_diffusion\n",
+    "\n",
+    "(fig_diff, _), (fig_adv, _) = validate_advection_diffusion(\n",
+    "    compute_advection_diffusion_kernel, n_grid=512, kx=1, ky=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Building block 1.2: Time March\n",
+    "\n",
+    "A simple way to handle the time evolution of $\\omega$ would be to do a *first-order* approximation below (also called **forward Euler scheme**). The terms in $\\color{green}{\\text{green}}$ are the ones for which we have already written Warp functions in the previous cells.\n",
+    "\n",
+    "$$\\frac{\\partial \\omega}{\\partial t} \\approx \\frac{\\omega^{(n+1)} - \\omega^{(n)}}{\\Delta t} = \\color{green}\\frac{1}{\\text{Re}}\\text{Diffusion}_{i,j} - \\text{Advection}_{i,j}.$$\n",
+    "\n",
+    "Rearranging:\n",
+    "\n",
+    "$$\\omega^{(n+1)} = \\omega^{(n)} + \\Delta t \\left( \\frac{1}{\\text{Re}}\\text{Diffusion}_{i,j} - \\text{Advection}_{i,j} \\right).$$\n",
+    "\n",
+    "where $\\mathcal{L}(\\omega) = \\frac{1}{\\text{Re}}\\text{Diffusion} - \\text{Advection}$ is the RHS term we just discretized above.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Explicit time-integration and SIMT schemes\n",
+    "\n",
+    "The forward Euler method for approximating $\\partial \\omega/\\partial t$ belongs to a general class of **explicit time-integration schemes**, wherein the solution at timestep $n+1$ depends solely on quantities evaluated at timestep $n$. This property makes explicit schemes particularly well-suited for the **Single Instruction, Multiple Threads (SIMT)** execution model. \n",
+    "\n",
+    "With the entire field from the previous timestep residing in GPU memory, each thread can independently read the required stencil values and compute the updated field value at its assigned grid point\u2014enabling simultaneous updates across all $N \\times N$ grid points. \n",
+    "\n",
+    "The figure below illustrates the 5-point stencil pattern: to compute the updated value at cell $(i,j)$, we access its four cardinal neighbors and the center point\u2014all from timestep $n$. The result is written to a separate output array representing timestep $n+1$.\n",
+    "\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/stencil.png\" alt=\"5-point stencil for advection and diffusion\" width=\"700\">\n",
+    "</p>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### EXERCISE: Given that we have the correct `def advection(...)` and `def diffusion(...)` functions, let's assemble the timestepping kernel together.\n",
+    "\n",
+    "Note below that `omega_0` corresponds to $\\omega^{n}$ and `omega_1` corresponds to $\\omega^{n+1}$."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@wp.kernel\n",
+    "def viscous_advection_kernel(\n",
+    "    n: int,\n",
+    "    h: float,\n",
+    "    re: float,\n",
+    "    dt: float,\n",
+    "    omega_0: wp.array2d(dtype=float),\n",
+    "    omega_1: wp.array2d(dtype=float),\n",
+    "    psi: wp.array2d(dtype=float),\n",
+    "    rhs: wp.array2d(dtype=float),\n",
+    "):\n",
+    "    \"\"\"Compute advection + diffusion and advance vorticity using forward Euler.\n",
+    "\n",
+    "    Args:\n",
+    "        n: Grid size.\n",
+    "        h: Grid spacing.\n",
+    "        re: Reynolds number.\n",
+    "        dt: Timestep size.\n",
+    "        omega_0: Vorticity field at the beginning of the timestep.\n",
+    "        omega_1: Vorticity field at the end of the timestep.\n",
+    "        psi: Stream function field.\n",
+    "        rhs: Temporarily stores diffusion + advection terms.\n",
+    "    \"\"\"\n",
+    "    i, j = wp.tid()\n",
+    "\n",
+    "    # obtain the neighboring indices for the [i, j]th cell in a periodic square box\n",
+    "    left_idx = cyclic_index(i - 1, n)\n",
+    "    right_idx = cyclic_index(i + 1, n)\n",
+    "    top_idx = cyclic_index(j + 1, n)\n",
+    "    down_idx = cyclic_index(j - 1, n)\n",
+    "\n",
+    "    # compute viscous diffusion term\n",
+    "    rhs[i, j] = (1.0 / re) * diffusion(\n",
+    "        omega_1[left_idx, j],\n",
+    "        omega_1[right_idx, j],\n",
+    "        omega_1[i, j],\n",
+    "        omega_1[i, down_idx],\n",
+    "        omega_1[i, top_idx],\n",
+    "        h,\n",
+    "    )\n",
+    "\n",
+    "    # subtract advection term\n",
+    "    rhs[i, j] -= advection(\n",
+    "        omega_1[left_idx, j],\n",
+    "        omega_1[right_idx, j],\n",
+    "        omega_1[i, top_idx],\n",
+    "        omega_1[i, down_idx],\n",
+    "        psi[left_idx, j],\n",
+    "        psi[right_idx, j],\n",
+    "        psi[i, top_idx],\n",
+    "        psi[i, down_idx],\n",
+    "        h,\n",
+    "    )\n",
+    "\n",
+    "    # forward Euler update: omega^(n+1) = omega^n + dt * L(omega^n)\n",
+    "    omega_1[i, j] = omega_0[i, j] + dt * rhs[i, j]  # MISSING"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Building block 2: Poisson Solver \n",
+    "\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/solver_pipeline_simple_poisson.svg\" alt=\"Solver pipeline highlighting discretization step\" width=\"800\">\n",
+    "</p>\n",
+    "\n",
+    "Recall from equation (3) in the Background section that the Poisson equation in the Fourier space is\n",
+    "\n",
+    "$$\\hat{\\psi}_{m,n} = \\frac{\\hat{\\omega}_{m,n}}{k_x^2 + k_y^2}, \\tag{3}$$\n",
+    "\n",
+    "that we solve for each $[k_x, k_y]$ pair. To this end, we need to follow the pipeline below.\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/fourier_solver_pipeline.png\" alt=\"Fourier Poisson Solver Pipeline\" width=\"800\">\n",
+    "</p>\n",
+    "\n",
+    "### Warp's FFT primitives\n",
+    "\n",
+    "Warp provides `wp.tile_fft()` and `wp.tile_ifft()` that operate on `wp.vec2f` (or `wp.vec2d`) arrays and perform row-wise FFT/IFFT, where say if $m = 2 + 3i$, then `m=wp.vec2f(2.0, 3.0)`\n",
+    "- `m.x = 2.0`: **real part**\n",
+    "- `m.y = 3.0`: **imaginary part**\n",
+    "\n",
+    "### What do we need to implement?\n",
+    "\n",
+    "1. **Data type conversion**: Kernels to move data from `wp.float32` ($\\omega$, $\\psi$) to `wp.vec2f` ($\\hat{\\omega}, \\hat{psi}$) and vice-versa.\n",
+    "\n",
+    "2. **Tile-based FFT/IFFT kernels**: Kernels that will use `wp.tile_fft()` and `wp.tile_ifft()` to perform row-wise FFT/IFFT.\n",
+    "\n",
+    "3. **Transpose kernel**: A matrix transpose kernel.\n",
+    "\n",
+    "Since our physical fields $\\omega$, $\\psi$ are stored as `wp.float32`, we need a helper kernel to convert between data types. Broadly, the idea again is to leverage Warp kernels to accelerate our numerical operations of FFT/IFFT.\n",
+    "\n",
+    "Let's build these step by step."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Building Block 2.1: Data type conversion kernels\n",
+    "\n",
+    "Since Warp's FFT operates on `wp.vec2f` (complex) arrays but our physical fields are `wp.float32` (real), we need:\n",
+    "\n",
+    "- `copy_float_to_vec2`: Convert real array to complex with zero imaginary part (before FFT).\n",
+    "- `extract_real_and_scale`: Extract real part and apply scaling by dividing the real part by `scale` (after IFFT)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### EXERCISE: Complete the `def extract_real_and_scale(...)` kernel definition. `def copy_float_to_vec2(...)` is already completed for you for reference."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@wp.kernel\n",
+    "def copy_float_to_vec2(\n",
+    "    omega: wp.array2d(dtype=wp.float32), omega_complex: wp.array2d(dtype=wp.vec2f)\n",
+    "):\n",
+    "    \"\"\"Copy real vorticity to a complex array with zero imaginary part.\n",
+    "\n",
+    "    Args:\n",
+    "        omega: Input real-valued vorticity array.\n",
+    "        omega_complex: Output complex array where real part is omega, imaginary is 0.\n",
+    "    \"\"\"\n",
+    "    i, j = wp.tid()\n",
+    "    omega_complex[i, j] = wp.vec2f(omega[i, j], 0.0)\n",
+    "\n",
+    "\n",
+    "@wp.kernel\n",
+    "def extract_real_and_scale(\n",
+    "    scale: wp.float32,\n",
+    "    complex_array: wp.array2d(dtype=wp.vec2f),\n",
+    "    real_array: wp.array2d(dtype=wp.float32),\n",
+    "):\n",
+    "    \"\"\"Extract real part from complex array and scale in one pass.\n",
+    "\n",
+    "    Args:\n",
+    "        scale: Scale factor to multiply each element by.\n",
+    "        complex_array: Input complex array (vec2f where .x is real part).\n",
+    "        real_array: Output real array (scaled).\n",
+    "    \"\"\"\n",
+    "    i, j = wp.tid()\n",
+    "    # real_array = Re(complex_array) divided by scale\n",
+    "    real_array[i, j] = complex_array[i, j].x / scale  # MISSING"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Building Block 2.2: Tile-based FFT and IFFT kernels (WIP)\n",
+    "\n",
+    "Warp's `wp.tile_fft()` and `wp.tile_ifft()` perform FFT on tiles loaded into registers. For row-wise transforms:\n",
+    "\n",
+    "1. Each thread block loads one row (tile) of the 2D array\n",
+    "2. Performs FFT/IFFT using `wp.tile_fft()` / `wp.tile_ifft()`\n",
+    "3. Stores the result back to global memory\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@wp.kernel\n",
+    "def fft_tiled(x: wp.array2d(dtype=wp.vec2f), y: wp.array2d(dtype=wp.vec2f)):\n",
+    "    \"\"\"Perform 1-D FFT on each row using wp.tile_fft().\n",
+    "\n",
+    "    Args:\n",
+    "        x: Input complex array of shape (N, N).\n",
+    "        y: Output complex array of shape (N, N) storing FFT results.\n",
+    "    \"\"\"\n",
+    "    i, _, _ = wp.tid()\n",
+    "    a = wp.tile_load(x, shape=(1, N_GRID), offset=(i * 1, 0))\n",
+    "    wp.tile_fft(a)\n",
+    "    wp.tile_store(y, a, offset=(i * 1, 0))\n",
+    "\n",
+    "\n",
+    "@wp.kernel\n",
+    "def ifft_tiled(x: wp.array2d(dtype=wp.vec2f), y: wp.array2d(dtype=wp.vec2f)):\n",
+    "    \"\"\"Perform 1-D inverse FFT on each row using wp.tile_ifft().\n",
+    "\n",
+    "    Args:\n",
+    "        x: Input complex array of shape (N, N).\n",
+    "        y: Output complex array of shape (N, N) storing IFFT results.\n",
+    "    \"\"\"\n",
+    "    i, _, _ = wp.tid()\n",
+    "    a = wp.tile_load(x, shape=(1, N_GRID), offset=(i * 1, 0))\n",
+    "    wp.tile_ifft(a)\n",
+    "    wp.tile_store(y, a, offset=(i * 1, 0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Validating FFT/IFFT kernels \n",
+    "To verify that `fft_tiled` and `ifft_tiled` work correctly, we test them with a known signal $f(x) = \\sin(x)$ on the domain $[0, 2\\pi)$. The Fourier transform of $\\sin(x)$ has a simple analytical form. Since $\\sin(x) = \\frac{e^{ix} - e^{-ix}}{2i}$, the FFT should produce peaks only at wavenumbers $k = \\pm 1$, with zero amplitude elsewhere. \n",
+    "\n",
+    "The `validate_fft_roundtrip(...)` function initializes the signal $f(x)$ on the $[0, 2\\pi)$ domain and performs a full roundtrip, i.e., it applies the FFT to the signal, then applies the IFFT to the result. The function then plots the magnitude spectrum $|\\hat{f}(k)|$ and compares the original signal with the reconstructed signal obtained from the FFT $\\rightarrow$ IFFT roundtrip."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Validate FFT kernels with a simple sine wave test\n",
+    "from utils import validate_fft_roundtrip\n",
+    "\n",
+    "fig, axes, max_error = validate_fft_roundtrip(\n",
+    "    fft_kernel=fft_tiled,\n",
+    "    ifft_kernel=ifft_tiled,\n",
+    "    n_grid=N_GRID,\n",
+    "    tile_m=1,\n",
+    "    tile_n=N_GRID,\n",
+    "    block_dim=N_GRID // 2,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Building Block 2.3:  Transpose kernel\n",
+    "\n",
+    "To compute 2D FFT using 1D row transforms, we need a transpose operation between passes:\n",
+    "\n",
+    "$$\\text{2D FFT} = \\text{row FFT} \\rightarrow \\text{transpose} \\rightarrow \\text{row FFT}.$$\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Exercise: Fill in the missing `def transpose(...)` kernel."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@wp.kernel\n",
+    "def transpose(x: wp.array2d(dtype=wp.vec2f), y: wp.array2d(dtype=wp.vec2f)):\n",
+    "    \"\"\"Transpose a 2-D array.\n",
+    "\n",
+    "    Args:\n",
+    "        x: Input complex array.\n",
+    "        y: Output complex array storing the transpose of x.\n",
+    "    \"\"\"\n",
+    "    i, j = wp.tid()\n",
+    "    y[i, j] = x[j, i]  # MISSING"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Validating transpose kernel\n",
+    "\n",
+    "We validate this using an upper triangular matrix, which should become lower triangular after transpose. For example, in case of a $4 \\times 4$ upper triangular matrix:\n",
+    "\n",
+    "$$\n",
+    "\\begin{bmatrix} 1 & 1 & 1 & 1 \\\\ 0 & 1 & 1 & 1 \\\\ 0 & 0 & 1 & 1 \\\\ 0 & 0 & 0 & 1 \\end{bmatrix}\n",
+    "\\xrightarrow{\\text{transpose}}\n",
+    "\\begin{bmatrix} 1 & 0 & 0 & 0 \\\\ 1 & 1 & 0 & 0 \\\\ 1 & 1 & 1 & 0 \\\\ 1 & 1 & 1 & 1 \\end{bmatrix}.\n",
+    "$$"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Exercise: Run the cell below and feel free to change `n_test` to confirm that your transpose kernel is doing what's intended."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils import validate_transpose\n",
+    "\n",
+    "fig, axes = validate_transpose(transpose_kernel=transpose, n_test=64)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Going from $\\hat{\\omega}$ to $\\hat{\\psi}$ in the Fourier space\n",
+    "\n",
+    "After transforming $\\omega$ to Fourier space, we solve equation (3)\n",
+    "\n",
+    "$$\\hat{\\psi}_{m,n} = \\frac{\\hat{\\omega}_{m,n}}{k_x^2 + k_y^2} = \\frac{\\hat{\\omega}_{m,n}}{|k|^2} = \\hat{\\omega}_{m,n} \\cdot |k|^{-2}.$$\n",
+    "\n",
+    "We use the precomputed $1/|k|^2$ in the beginning of the lab to multiply to the `omega_hat` array to obtain the `psi_hat` array.\n",
+    " \n",
+    "We have already filled up the kernel for you."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@wp.kernel\n",
+    "def multiply_k2_inverse(\n",
+    "    k2i: wp.array2d(dtype=wp.float32),\n",
+    "    omega_hat: wp.array2d(dtype=wp.vec2f),\n",
+    "    psi_hat: wp.array2d(dtype=wp.vec2f),\n",
+    "):\n",
+    "    \"\"\"Solve Poisson equation in Fourier space: psi_hat = omega_hat / |k|^2.\n",
+    "\n",
+    "    Args:\n",
+    "        k2i: Precomputed 1/|k|^2 array (0 at k=0).\n",
+    "        omega_hat: Fourier transform of vorticity.\n",
+    "        psi_hat: Output Fourier transform of stream function.\n",
+    "    \"\"\"\n",
+    "    i, j = wp.tid()\n",
+    "    psi_hat[i, j] = omega_hat[i, j] * k2i[i, j]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Assembling all the components together to build our one solver loop \n",
+    "\n",
+    "<p align=\"center\">\n",
+    "  <img src=\"images/chapter-12.2/solver_pipeline_simple.svg\" alt=\"Solver Pipeline\" width=\"800\">\n",
+    "</p>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that we have all the kernel building blocks in place, we'll assemble them into the complete `step()` function below. One call to `step()` basically captures one solver loop. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def step(omega_0, omega_1, psi, rhs, k2i):\n",
+    "    \"\"\"Advance simulation by one timestep using forward Euler.\n",
+    "\n",
+    "    Full pipeline: advection/diffusion + Poisson solve via FFT.\n",
+    "\n",
+    "    Args:\n",
+    "        omega_0: Vorticity at the start of timestep on the 2-D grid.\n",
+    "        omega_1: Vorticity at the end of timestep on the 2-D grid.\n",
+    "        psi: Stream function on the 2-D grid.\n",
+    "        rhs: Temporary array for advection + diffusion terms.\n",
+    "        k2i: Precomputed 1/|k|^2 for Poisson solver.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Allocate temporary buffers for FFT operations\n",
+    "    omega_complex = wp.zeros((N_GRID, N_GRID), dtype=wp.vec2f)\n",
+    "    fft_temp_1 = wp.zeros((N_GRID, N_GRID), dtype=wp.vec2f)\n",
+    "    fft_temp_2 = wp.zeros((N_GRID, N_GRID), dtype=wp.vec2f)\n",
+    "    fft_temp_3 = wp.zeros((N_GRID, N_GRID), dtype=wp.vec2f)\n",
+    "\n",
+    "    # ===== Fourier Poisson solver =====\n",
+    "\n",
+    "    # Convert omega_1 (real) to wp.vec2f() data type\n",
+    "    wp.launch(\n",
+    "        copy_float_to_vec2,\n",
+    "        dim=(N_GRID, N_GRID),\n",
+    "        inputs=[omega_1],\n",
+    "        outputs=[omega_complex],\n",
+    "    )\n",
+    "\n",
+    "    # Forward 2D FFT: row FFT -> transpose -> row FFT\n",
+    "    wp.launch_tiled(\n",
+    "        fft_tiled,\n",
+    "        dim=[N_GRID, 1],\n",
+    "        inputs=[omega_complex],\n",
+    "        outputs=[fft_temp_1],\n",
+    "        block_dim=N_GRID // 2,\n",
+    "    )\n",
+    "    wp.launch(\n",
+    "        transpose, dim=[N_GRID, N_GRID], inputs=[fft_temp_1], outputs=[fft_temp_2]\n",
+    "    )\n",
+    "    wp.launch_tiled(\n",
+    "        fft_tiled,\n",
+    "        dim=[N_GRID, 1],\n",
+    "        inputs=[fft_temp_2],\n",
+    "        outputs=[fft_temp_3],\n",
+    "        block_dim=N_GRID // 2,\n",
+    "    )\n",
+    "\n",
+    "    # Solve in Fourier space: psi_hat = omega_hat / |k|^2\n",
+    "    wp.launch(\n",
+    "        multiply_k2_inverse,\n",
+    "        dim=(N_GRID, N_GRID),\n",
+    "        inputs=[k2i, fft_temp_3],\n",
+    "        outputs=[fft_temp_1],\n",
+    "    )\n",
+    "\n",
+    "    # Inverse 2D FFT: row IFFT -> transpose -> row IFFT\n",
+    "    wp.launch_tiled(\n",
+    "        ifft_tiled,\n",
+    "        dim=[N_GRID, 1],\n",
+    "        inputs=[fft_temp_1],\n",
+    "        outputs=[fft_temp_2],\n",
+    "        block_dim=N_GRID // 2,\n",
+    "    )\n",
+    "    wp.launch(\n",
+    "        transpose, dim=[N_GRID, N_GRID], inputs=[fft_temp_2], outputs=[fft_temp_3]\n",
+    "    )\n",
+    "    wp.launch_tiled(\n",
+    "        ifft_tiled,\n",
+    "        dim=[N_GRID, 1],\n",
+    "        inputs=[fft_temp_3],\n",
+    "        outputs=[fft_temp_1],\n",
+    "        block_dim=N_GRID // 2,\n",
+    "    )\n",
+    "    wp.launch(\n",
+    "        extract_real_and_scale,\n",
+    "        dim=(N_GRID, N_GRID),\n",
+    "        inputs=[wp.float32(N_GRID * N_GRID), fft_temp_1],\n",
+    "        outputs=[psi],\n",
+    "    )\n",
+    "\n",
+    "    rhs.zero_()\n",
+    "\n",
+    "    # ===== Discretization + Time Marching =====\n",
+    "    wp.launch(\n",
+    "        viscous_advection_kernel,\n",
+    "        dim=(N_GRID, N_GRID),\n",
+    "        inputs=[N_GRID, H, RE, DT, omega_0, omega_1, psi, rhs],\n",
+    "    )\n",
+    "\n",
+    "    # Copy omega_1 back to omega_0 for next timestep\n",
+    "    wp.copy(omega_0, omega_1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CUDA graph capture\n",
+    "\n",
+    "CUDA graphs let you define a sequence of operations (kernel launches, data movement) and their dependencies once, then replay it repeatedly with minimal overhead. Normally, each kernel launch incurs CPU-side setup costs. For short-running kernels launched many times, this overhead dominates. By capturing the entire workflow in a graph, these setup costs are paid once during graph creation, and subsequent launches are nearly free.\n",
+    "\n",
+    "In Warp, you can capture a CUDA graph with `wp.ScopedCapture()`. In the code block below, a single call to `step(...)` is captured and will be replayed for as many times as we want to run the simulation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# capture first step in a CUDA graph\n",
+    "with wp.ScopedCapture() as capture:\n",
+    "    step(omega_0, omega_1, psi, rhs, k2i)\n",
+    "step_graph = capture.graph"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Benchmarking CUDA Graph WIP"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Running the simulation and saving the results\n",
+    "\n",
+    "The code below runs the simulation for 2000 timesteps and saves the vorticity field at every 10th timestep. The vorticity fields are then plotted and saved as a GIF."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from PIL import Image\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib.colors import Normalize\n",
+    "import os\n",
+    "\n",
+    "NUM_FRAMES = 2000\n",
+    "STEPS_PER_FRAME = 10\n",
+    "\n",
+    "# Colormap setup\n",
+    "cmap = plt.cm.twilight\n",
+    "norm = Normalize(vmin=-15, vmax=15)\n",
+    "\n",
+    "frames = []\n",
+    "print(f\"Running {NUM_FRAMES} frames ({STEPS_PER_FRAME} steps each)...\")\n",
+    "\n",
+    "start_time = time.perf_counter()\n",
+    "for frame in range(NUM_FRAMES):\n",
+    "    # Advance simulation\n",
+    "    for _ in range(STEPS_PER_FRAME):\n",
+    "        wp.capture_launch(step_graph)\n",
+    "\n",
+    "    # Capture frame\n",
+    "    vorticity = omega_1.numpy().T  # Transpose for correct orientation\n",
+    "    colored = cmap(norm(vorticity))\n",
+    "    rgb = (colored[:, :, :3] * 255).astype(np.uint8)\n",
+    "    frames.append(rgb)\n",
+    "\n",
+    "    if (frame + 1) % 50 == 0:\n",
+    "        print(f\"  Frame {frame + 1}/{NUM_FRAMES}\")\n",
+    "\n",
+    "elapsed = time.perf_counter() - start_time\n",
+    "total_steps = NUM_FRAMES * STEPS_PER_FRAME\n",
+    "print(\n",
+    "    f\"Completed {total_steps} steps in {elapsed:.2f}s ({total_steps / elapsed:.0f} steps/s)\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import IPython.display\n",
+    "\n",
+    "# Create output directory\n",
+    "os.makedirs(\"./images/chapter-12.2\", exist_ok=True)\n",
+    "output_file = f\"./images/chapter-12.2/turbulence_{N_GRID}x{N_GRID}_Re{int(RE)}.gif\"\n",
+    "\n",
+    "# Save as GIF (resize to reduce file size)\n",
+    "GIF_SIZE = 256  # Target resolution for GIF\n",
+    "pil_images = [\n",
+    "    Image.fromarray(frame).resize((GIF_SIZE, GIF_SIZE), Image.LANCZOS)\n",
+    "    for frame in frames\n",
+    "]\n",
+    "pil_images[0].save(\n",
+    "    output_file,\n",
+    "    save_all=True,\n",
+    "    append_images=pil_images[1:],\n",
+    "    duration=50,\n",
+    "    loop=0,\n",
+    ")\n",
+    "\n",
+    "print(f\"Saved: {output_file}\")\n",
+    "IPython.display.Image(output_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Conclusion WIP"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "---\n",
+    "## References WIP"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Appendix A\n",
+    "\n",
+    "### 1. The governing equations\n",
+    "In 2-D incompressible flow (density constant), the Navier-Stokes equations consist of three equations:\n",
+    " \n",
+    "Momentum in $x$\n",
+    "$$\\frac{\\partial u}{\\partial t} + u\\frac{\\partial u}{\\partial x} + v\\frac{\\partial u}{\\partial y} = -\\frac{\\partial p}{\\partial x} + \\frac{1}{Re}\\left(\\frac{\\partial^2 u}{\\partial x^2} + \\frac{\\partial^2 u}{\\partial y^2}\\right),$$\n",
+    " \n",
+    "Momentum in $y$\n",
+    "$$\\frac{\\partial v}{\\partial t} + u\\frac{\\partial v}{\\partial x} + v\\frac{\\partial v}{\\partial y} = -\\frac{\\partial p}{\\partial y} + \\frac{1}{Re}\\left(\\frac{\\partial^2 v}{\\partial x^2} + \\frac{\\partial^2 v}{\\partial y^2}\\right),$$\n",
+    " \n",
+    "Continuity\n",
+    "$$\\frac{\\partial u}{\\partial x} + \\frac{\\partial v}{\\partial y} = 0.$$\n",
+    " \n",
+    "**This means solving for three unknowns: velocity components $(u, v)$ and pressure $p$ and three equations.** The Reynolds number $Re$ is a dimensionless parameter that dictates the large-scale behavior of the fluid in the box. \n",
+    "\n",
+    "But there's a cleaner approach using two scalar fields defined in terms of $(u, v)$, namely \n",
+    "\n",
+    "- **Vorticity** $\\omega$: Measures local rotation of the fluid. In 2-D, only one component is non-zero, which we will call $\\omega = \\frac{\\partial v}{\\partial x} - \\frac{\\partial u}{\\partial y}$ in subsequent sections.\n",
+    " \n",
+    "$$\\boldsymbol{\\omega} = \\nabla \\times \\mathbf{u} = \\begin{vmatrix} \\mathbf{i} & \\mathbf{j} & \\mathbf{k} \\\\ \\frac{\\partial}{\\partial x} & \\frac{\\partial}{\\partial y} & \\frac{\\partial}{\\partial z}=0 \\\\ u & v & w=0 \\end{vmatrix} = \\begin{pmatrix} \\frac{\\partial v}{\\partial x} - \\frac{\\partial u}{\\partial y} \\\\ 0 \\\\ 0 \\end{pmatrix}.$$\n",
+    "  \n",
+    "- **Stream function** $\\psi$: Defines velocity through its derivatives as follows. This formulation ensures that the continuity equation is automatically satisfied.\n",
+    "$$u = \\frac{\\partial \\psi}{\\partial y}, v = -\\frac{\\partial \\psi}{\\partial x}.$$\n",
+    "\n",
+    "### 2. The boundary conditions\n",
+    "We choose to simulate the above equations in a square box with **periodic boundary conditions**. In simpler terms, *what exits from one side enters from the other*\n",
+    "\n",
+    "To illustrate this further, we tile the original $N \\times N$ simulation side by side to form a $2N \\times 2N$ GIF. Notice how structures repeat across tiles and vortices maintain continuity (and differentiability!) at the edges.\n",
+    " \n",
+    "![4pi_simulation](images/chapter-12.2/periodicity_ns_tiled.gif)\n",
+    "\n",
+    "Which mathematical functions have the property of repeating after regular intervals and maintaining continuity/differentiability at the edges?\n",
+    "\n",
+    "**Sines and cosines!**: These functions are the building blocks of periodic functions. A function $f(x)$ with period $L$ satisfies $f(x + L) = f(x)$, exactly like $\\sin(x)$ and $\\cos(x)$ with period $2\\pi$.\n",
+    "\n",
+    "<img src=\"images/chapter-12.2/periodicity_sin_cos.png\" alt=\"Periodicity of sin and cos\" width=\"60%\">\n",
+    "\n",
+    "**Periodic boundary conditions** and **Fourier methods** go hand in hand. Any periodic function can be decomposed into a sum of sines and cosines (a Fourier series), and operations like derivatives become simple multiplications in Fourier space. We'll use this when solving the Poisson equation. When $\\omega$ and $\\psi$ are represented using Fourier series as\n",
+    " \n",
+    "$$\\omega(x,y) = \\sum_{m,n} \\hat{\\omega}_{m,n} e^{i(k_x x + k_y y)}, \\quad \\psi(x,y) = \\sum_{m,n} \\hat{\\psi}_{m,n} e^{i(k_x x + k_y y)},$$ \n",
+    "\n",
+    "the Poisson equation transforms as \n",
+    "\n",
+    "$$\\nabla^2 \\psi = -\\omega \\quad \\xrightarrow{\\text{Fourier}} \\quad -(k_x^2 + k_y^2)\\hat{\\psi} = -\\hat{\\omega} \\quad \\Rightarrow \\quad \\hat{\\psi} = \\frac{\\hat{\\omega}}{k_x^2 + k_y^2}. \\tag{3}$$\n",
+    "\n",
+    "Thus, we have **transformed a PDE in physical space to a simple algebraic equation in Fourier space**. This would allow us to leverage Warp's FFT functionalities to solve our Equation (2) above.\n",
+    "\n",
+    "Following common convention in academic literature, **we set our simulation domain to $2\\pi \\times 2\\pi$**. This choice naturally simplifies Fourier transform operations, as the wavenumbers become integers $(k_x, k_y = 0, \\pm 1, \\pm 2, \\ldots)$.\n",
+    "\n",
+    "### 3. The initial conditions\n",
+    "\n",
+    "For the initial conditions, we seed the domain with an array of swirling vortices. The derivation of this particular initialization is beyond the scope of this lab; interested readers can refer to San and Staples (2012), *Computers & Fluids*, for further details.\n",
+    "\n",
+    "<img src=\"images/chapter-12.2/initial_vorticity.png\" alt=\"Initial vorticity field\" width=\"40%\">"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "warp-env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/building_block_1_flow.svg b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/building_block_1_flow.svg
new file mode 100644
index 00000000..28dafc12
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/building_block_1_flow.svg
@@ -0,0 +1,120 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg width="900" height="280" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <style>
+      .field-box { fill: #FFF9C4; stroke: #1A237E; stroke-width: 2.5; }
+      .compute-box { fill: #E3F2FD; stroke: #1A237E; stroke-width: 2; }
+      .output-box { fill: #E8F5E9; stroke: #1A237E; stroke-width: 2.5; }
+      .text-main { font-family: Arial, sans-serif; font-size: 16px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-large { font-family: Arial, sans-serif; font-size: 18px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-small { font-family: Arial, sans-serif; font-size: 12px; fill: #1A237E; text-anchor: middle; }
+      .text-label { font-family: Arial, sans-serif; font-size: 14px; font-style: italic; fill: #1976D2; text-anchor: middle; }
+      .arrow { stroke: #1976D2; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; marker-end: url(#arrowhead); }
+      .grid-line { stroke: #90A4AE; stroke-width: 1; }
+    </style>
+    <marker id="arrowhead" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto" markerUnits="strokeWidth">
+      <path d="M 0 0 L 7 3 L 0 6 Q 2 3 0 0" fill="#1976D2" />
+    </marker>
+  </defs>
+
+  <!-- Background -->
+  <rect width="900" height="280" fill="white"/>
+
+  <!-- ===== INPUT: omega(t), psi(t) on grid ===== -->
+  <rect x="30" y="80" width="140" height="120" rx="8" class="field-box"/>
+  
+  <!-- Small grid inside input box (4x4) -->
+  <g transform="translate(50, 100)">
+    <rect x="0" y="0" width="60" height="60" fill="#FFECB3" stroke="#1A237E" stroke-width="1"/>
+    <line x1="15" y1="0" x2="15" y2="60" class="grid-line"/>
+    <line x1="30" y1="0" x2="30" y2="60" class="grid-line"/>
+    <line x1="45" y1="0" x2="45" y2="60" class="grid-line"/>
+    <line x1="0" y1="15" x2="60" y2="15" class="grid-line"/>
+    <line x1="0" y1="30" x2="60" y2="30" class="grid-line"/>
+    <line x1="0" y1="45" x2="60" y2="45" class="grid-line"/>
+    <!-- Dots inside grid cells -->
+    <circle cx="7.5" cy="7.5" r="2" fill="#1A237E"/>
+    <circle cx="22.5" cy="22.5" r="2" fill="#1A237E"/>
+    <circle cx="37.5" cy="37.5" r="2" fill="#1A237E"/>
+    <circle cx="52.5" cy="52.5" r="2" fill="#1A237E"/>
+  </g>
+  
+  <text x="100" y="180" class="text-large">&#969;(t), &#968;(t)</text>
+  <text x="100" y="65" class="text-label">Input</text>
+
+  <!-- Arrow 1 -->
+  <path d="M 170 140 L 220 140" class="arrow"/>
+
+  <!-- ===== COMPUTE: Advection + Diffusion ===== -->
+  <rect x="220" y="70" width="200" height="140" rx="8" class="compute-box"/>
+  
+  <!-- Two small grids for Advection and Diffusion -->
+  <g transform="translate(240, 90)">
+    <rect x="0" y="0" width="50" height="50" fill="#BBDEFB" stroke="#1A237E" stroke-width="1"/>
+    <line x1="12.5" y1="0" x2="12.5" y2="50" class="grid-line"/>
+    <line x1="25" y1="0" x2="25" y2="50" class="grid-line"/>
+    <line x1="37.5" y1="0" x2="37.5" y2="50" class="grid-line"/>
+    <line x1="0" y1="12.5" x2="50" y2="12.5" class="grid-line"/>
+    <line x1="0" y1="25" x2="50" y2="25" class="grid-line"/>
+    <line x1="0" y1="37.5" x2="50" y2="37.5" class="grid-line"/>
+    <!-- Dots inside grid cells -->
+    <circle cx="6.25" cy="6.25" r="1.5" fill="#1A237E"/>
+    <circle cx="18.75" cy="18.75" r="1.5" fill="#1A237E"/>
+    <circle cx="31.25" cy="31.25" r="1.5" fill="#1A237E"/>
+    <circle cx="43.75" cy="43.75" r="1.5" fill="#1A237E"/>
+  </g>
+  <text x="265" y="160" class="text-small">Advection</text>
+  
+  <g transform="translate(350, 90)">
+    <rect x="0" y="0" width="50" height="50" fill="#BBDEFB" stroke="#1A237E" stroke-width="1"/>
+    <line x1="12.5" y1="0" x2="12.5" y2="50" class="grid-line"/>
+    <line x1="25" y1="0" x2="25" y2="50" class="grid-line"/>
+    <line x1="37.5" y1="0" x2="37.5" y2="50" class="grid-line"/>
+    <line x1="0" y1="12.5" x2="50" y2="12.5" class="grid-line"/>
+    <line x1="0" y1="25" x2="50" y2="25" class="grid-line"/>
+    <line x1="0" y1="37.5" x2="50" y2="37.5" class="grid-line"/>
+    <!-- Dots inside grid cells -->
+    <circle cx="6.25" cy="6.25" r="1.5" fill="#1A237E"/>
+    <circle cx="18.75" cy="18.75" r="1.5" fill="#1A237E"/>
+    <circle cx="31.25" cy="31.25" r="1.5" fill="#1A237E"/>
+    <circle cx="43.75" cy="43.75" r="1.5" fill="#1A237E"/>
+  </g>
+  <text x="375" y="160" class="text-small">Diffusion</text>
+  
+  <text x="320" y="190" class="text-main">Finite Differences</text>
+  <text x="320" y="55" class="text-label">Compute RHS</text>
+
+  <!-- Arrow 2 -->
+  <path d="M 420 140 L 470 140" class="arrow"/>
+
+  <!-- ===== TIME MARCH: Forward Euler ===== -->
+  <rect x="470" y="90" width="160" height="100" rx="8" class="compute-box"/>
+  <text x="550" y="130" class="text-main">&#969; + &#916;t&#183;RHS</text>
+  <text x="550" y="155" class="text-small">(Forward Euler)</text>
+  <text x="550" y="75" class="text-label">Time March</text>
+
+  <!-- Arrow 3 -->
+  <path d="M 630 140 L 680 140" class="arrow"/>
+
+  <!-- ===== OUTPUT: omega(t+dt) on grid ===== -->
+  <rect x="680" y="80" width="140" height="120" rx="8" class="output-box"/>
+  
+  <!-- Small grid inside output box -->
+  <g transform="translate(700, 100)">
+    <rect x="0" y="0" width="60" height="60" fill="#C8E6C9" stroke="#1A237E" stroke-width="1"/>
+    <line x1="15" y1="0" x2="15" y2="60" class="grid-line"/>
+    <line x1="30" y1="0" x2="30" y2="60" class="grid-line"/>
+    <line x1="45" y1="0" x2="45" y2="60" class="grid-line"/>
+    <line x1="0" y1="15" x2="60" y2="15" class="grid-line"/>
+    <line x1="0" y1="30" x2="60" y2="30" class="grid-line"/>
+    <line x1="0" y1="45" x2="60" y2="45" class="grid-line"/>
+    <!-- Dots inside grid cells -->
+    <circle cx="7.5" cy="7.5" r="2" fill="#1A237E"/>
+    <circle cx="22.5" cy="22.5" r="2" fill="#1A237E"/>
+    <circle cx="37.5" cy="37.5" r="2" fill="#1A237E"/>
+    <circle cx="52.5" cy="52.5" r="2" fill="#1A237E"/>
+  </g>
+  
+  <text x="750" y="180" class="text-large">&#969;(t+&#916;t)</text>
+  <text x="750" y="65" class="text-label">Output</text>
+</svg>
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/fourier_solver_pipeline.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/fourier_solver_pipeline.png
new file mode 100644
index 00000000..0eeaecf0
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/fourier_solver_pipeline.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e92577bc035520aa2270a78b6d9231e7490b34e639c115288ecf157975e9df3
+size 71788
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/initial_vorticity.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/initial_vorticity.png
new file mode 100644
index 00000000..229bb84f
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/initial_vorticity.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80c96b1f7b2b8c6cb9eb2e32e6fe6e39d7e31ab13c125df3204ef80cf721ad8c
+size 943877
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/navier_stokes_timestep.svg b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/navier_stokes_timestep.svg
new file mode 100644
index 00000000..977b6ee0
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/navier_stokes_timestep.svg
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg width="650" height="320" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <style>
+      .box { fill: #CBD5E8; stroke: #1A4A6E; stroke-width: 2; }
+      .text-math { font-family: "Times New Roman", Times, serif; font-size: 24px; font-style: italic; fill: #1A4A6E; text-anchor: middle; }
+      .text-label { font-family: "Times New Roman", Times, serif; font-size: 18px; fill: #1A4A6E; text-anchor: middle; }
+      .text-loop { font-family: "Times New Roman", Times, serif; font-size: 20px; fill: #1A4A6E; text-anchor: middle; }
+      .arrow-line { stroke: #1A4A6E; stroke-width: 3; fill: none; }
+    </style>
+    <marker id="arrowhead" markerWidth="10" markerHeight="10" refX="8" refY="5" orient="auto" markerUnits="strokeWidth">
+      <polygon points="0,0 10,5 0,10 2,5" fill="#1A4A6E"/>
+    </marker>
+  </defs>
+
+  <!-- Background -->
+  <rect width="650" height="320" fill="white"/>
+
+  <!-- Box 1: omega(t), psi(t) -->
+  <rect x="40" y="100" width="100" height="80" class="box"/>
+  <text x="90" y="135" class="text-math">&#969;(t)</text>
+  <text x="90" y="165" class="text-math">&#968;(t)</text>
+
+  <!-- Arrow 1 with label Eq. (1) -->
+  <path d="M 140 140 L 202 140" class="arrow-line" marker-end="url(#arrowhead)"/>
+  <text x="170" y="115" class="text-label">Eq. (1)</text>
+
+  <!-- Box 2: omega(t+Delta t) -->
+  <rect x="210" y="100" width="120" height="80" class="box"/>
+  <text x="270" y="150" class="text-math">&#969;(t+&#916;t)</text>
+
+  <!-- Arrow 2 with label Eq. (2) -->
+  <path d="M 330 140 L 392 140" class="arrow-line" marker-end="url(#arrowhead)"/>
+  <text x="360" y="115" class="text-label">Eq. (2)</text>
+
+  <!-- Box 3: psi(t+Delta t) -->
+  <rect x="400" y="100" width="120" height="80" class="box"/>
+  <text x="460" y="150" class="text-math">&#968;(t+&#916;t)</text>
+
+  <!-- Return arrow (rectangular): down, left, up -->
+  <path d="M 460 180 L 460 260 L 90 260 L 90 188" class="arrow-line" marker-end="url(#arrowhead)"/>
+
+  <!-- Label: Next timestep -->
+  <text x="275" y="290" class="text-loop">Next timestep</text>
+</svg>
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/omega_differential.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/omega_differential.png
new file mode 100644
index 00000000..1bf729ee
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/omega_differential.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:481d82b98eceb3d5bf9aaea18e1d8d7a9977a9125f6156e2a53717826df508d9
+size 80980
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/periodicity_ns_tiled.gif b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/periodicity_ns_tiled.gif
new file mode 100644
index 00000000..6d7f7ae7
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/periodicity_ns_tiled.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15595f47e213e02812d94fccc754e5e738bddde5759546a55175b9baa5ff544e
+size 8126464
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/periodicity_sin_cos.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/periodicity_sin_cos.png
new file mode 100644
index 00000000..70bb8502
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/periodicity_sin_cos.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d59f63e7e8ba3afd6f26a2980f4557e81e0198436771411133048bce24c61ef9
+size 80360
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/psi_differential.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/psi_differential.png
new file mode 100644
index 00000000..746289ab
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/psi_differential.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eadb82aac9b164c61b7ec411139113c2b4aa987b7120f6ed8b13575a78ac329b
+size 91791
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple.svg b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple.svg
new file mode 100644
index 00000000..3ac74308
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple.svg
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg width="1000" height="400" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <style>
+      .step-box { fill: #E3F2FD; stroke: #1A237E; stroke-width: 2; }
+      .field-box { fill: #FFF9C4; stroke: #1A237E; stroke-width: 2.5; }
+      .text-main { font-family: Arial, sans-serif; font-size: 16px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-large { font-family: Arial, sans-serif; font-size: 20px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-small { font-family: Arial, sans-serif; font-size: 14px; font-style: italic; fill: #1A237E; text-anchor: middle; }
+      .text-label { font-family: Arial, sans-serif; font-size: 14px; font-style: italic; font-weight: bold; fill: #1976D2; text-anchor: middle; }
+      .text-title { font-family: Arial, sans-serif; font-size: 22px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .arrow { stroke: #1976D2; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; marker-end: url(#arrowhead); }
+      .arrow-plain { stroke: #1976D2; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; }
+    </style>
+    <marker id="arrowhead" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto" markerUnits="strokeWidth">
+      <path d="M 0 0 L 7 3 L 0 6 Q 2 3 0 0" fill="#1976D2" />
+    </marker>
+  </defs>
+
+  <!-- Background -->
+  <rect width="1000" height="400" fill="white"/>
+
+  <!-- Flow at y=200 -->
+
+  <!-- 1. ω(t), ψ(t) -->
+  <rect x="50" y="160" width="120" height="80" rx="10" class="field-box"/>
+  <text x="110" y="195" class="text-large">ω(t)</text>
+  <text x="110" y="225" class="text-large">ψ(t)</text>
+
+  <!-- Arrow 1 -->
+  <path d="M 170 200 L 230 200" class="arrow"/>
+
+  <!-- 2. Discretization + Time Marching -->
+  <rect x="230" y="160" width="220" height="80" rx="10" class="step-box"/>
+  <text x="340" y="195" class="text-main">Discretization +</text>
+  <text x="340" y="215" class="text-main">Time Marching</text>
+
+  <!-- Arrow 2 -->
+  <path d="M 450 200 L 510 200" class="arrow"/>
+
+  <!-- 3. ω(t+Δt) -->
+  <rect x="510" y="160" width="130" height="80" rx="10" class="field-box"/>
+  <text x="575" y="210" class="text-large">ω(t+Δt)</text>
+
+  <!-- Arrow 3 -->
+  <path d="M 640 200 L 700 200" class="arrow"/>
+
+  <!-- 4. Fourier Poisson Solver -->
+  <rect x="680" y="160" width="240" height="80" rx="10" class="step-box"/>
+  <text x="800" y="195" class="text-main">Fourier Poisson Solver</text>
+  <text x="800" y="215" class="text-small">for ψ(t+Δt)</text>
+
+  <!-- Return path -->
+  <!-- Down with arrow from bottom of Poisson Solve box -->
+  <path d="M 800 240 L 800 320" class="arrow"/>
+  <!-- Left -->
+  <path d="M 800 320 L 110 320" class="arrow-plain"/>
+  <!-- Up with arrow -->
+  <path d="M 110 320 L 110 240" class="arrow"/>
+
+  <!-- Label on return arrow -->
+  <text x="500" y="355" class="text-label">Next timestep</text>
+</svg>
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple_discretization.svg b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple_discretization.svg
new file mode 100644
index 00000000..5b08a515
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple_discretization.svg
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg width="1000" height="400" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <style>
+      .step-box { fill: #E3F2FD; stroke: #1A237E; stroke-width: 2; }
+      .field-box { fill: #FFF9C4; stroke: #1A237E; stroke-width: 2.5; }
+      .text-main { font-family: Arial, sans-serif; font-size: 16px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-large { font-family: Arial, sans-serif; font-size: 20px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-small { font-family: Arial, sans-serif; font-size: 14px; font-style: italic; fill: #1A237E; text-anchor: middle; }
+      .text-label { font-family: Arial, sans-serif; font-size: 14px; font-style: italic; font-weight: bold; fill: #1976D2; text-anchor: middle; }
+      .text-title { font-family: Arial, sans-serif; font-size: 22px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .arrow { stroke: #1976D2; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; marker-end: url(#arrowhead); }
+      .arrow-plain { stroke: #1976D2; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; }
+    </style>
+    <marker id="arrowhead" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto" markerUnits="strokeWidth">
+      <path d="M 0 0 L 7 3 L 0 6 Q 2 3 0 0" fill="#1976D2" />
+    </marker>
+  </defs>
+
+  <!-- Background -->
+  <rect width="1000" height="400" fill="white"/>
+
+  <!-- Flow at y=200 -->
+
+  <!-- 1. ω(t), ψ(t) -->
+  <rect x="50" y="160" width="120" height="80" rx="10" class="field-box"/>
+  <text x="110" y="195" class="text-large">ω(t)</text>
+  <text x="110" y="225" class="text-large">ψ(t)</text>
+
+  <!-- Arrow 1 -->
+  <path d="M 170 200 L 230 200" class="arrow"/>
+
+  <!-- 2. Discretization + Time Marching (HIGHLIGHTED) -->
+  <rect x="230" y="160" width="220" height="80" rx="10" class="step-box" style="stroke-width: 5;"/>
+  <text x="340" y="195" class="text-main">Discretization +</text>
+  <text x="340" y="215" class="text-main">Time Marching</text>
+
+  <!-- Arrow 2 -->
+  <path d="M 450 200 L 510 200" class="arrow"/>
+
+  <!-- 3. ω(t+Δt) -->
+  <rect x="510" y="160" width="130" height="80" rx="10" class="field-box"/>
+  <text x="575" y="210" class="text-large">ω(t+Δt)</text>
+
+  <!-- Arrow 3 -->
+  <path d="M 640 200 L 700 200" class="arrow"/>
+
+  <!-- 4. Fourier Poisson Solver -->
+  <rect x="680" y="160" width="240" height="80" rx="10" class="step-box"/>
+  <text x="800" y="195" class="text-main">Fourier Poisson Solver</text>
+  <text x="800" y="215" class="text-small">for ψ(t+Δt)</text>
+
+  <!-- Return path -->
+  <!-- Down with arrow from bottom of Poisson Solve box -->
+  <path d="M 800 240 L 800 320" class="arrow"/>
+  <!-- Left -->
+  <path d="M 800 320 L 110 320" class="arrow-plain"/>
+  <!-- Up with arrow -->
+  <path d="M 110 320 L 110 240" class="arrow"/>
+
+  <!-- Label on return arrow -->
+  <text x="500" y="355" class="text-label">Next timestep</text>
+</svg>
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple_poisson.svg b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple_poisson.svg
new file mode 100644
index 00000000..2c2a1282
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/solver_pipeline_simple_poisson.svg
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg width="1000" height="400" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <style>
+      .step-box { fill: #E3F2FD; stroke: #1A237E; stroke-width: 2; }
+      .field-box { fill: #FFF9C4; stroke: #1A237E; stroke-width: 2.5; }
+      .text-main { font-family: Arial, sans-serif; font-size: 16px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-large { font-family: Arial, sans-serif; font-size: 20px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .text-small { font-family: Arial, sans-serif; font-size: 14px; font-style: italic; fill: #1A237E; text-anchor: middle; }
+      .text-label { font-family: Arial, sans-serif; font-size: 14px; font-style: italic; font-weight: bold; fill: #1976D2; text-anchor: middle; }
+      .text-title { font-family: Arial, sans-serif; font-size: 22px; font-weight: bold; fill: #1A237E; text-anchor: middle; }
+      .arrow { stroke: #1976D2; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; marker-end: url(#arrowhead); }
+      .arrow-plain { stroke: #1976D2; stroke-width: 3; fill: none; stroke-linecap: round; stroke-linejoin: round; }
+    </style>
+    <marker id="arrowhead" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto" markerUnits="strokeWidth">
+      <path d="M 0 0 L 7 3 L 0 6 Q 2 3 0 0" fill="#1976D2" />
+    </marker>
+  </defs>
+
+  <!-- Background -->
+  <rect width="1000" height="400" fill="white"/>
+
+  <!-- Flow at y=200 -->
+
+  <!-- 1. ω(t), ψ(t) -->
+  <rect x="50" y="160" width="120" height="80" rx="10" class="field-box"/>
+  <text x="110" y="195" class="text-large">ω(t)</text>
+  <text x="110" y="225" class="text-large">ψ(t)</text>
+
+  <!-- Arrow 1 -->
+  <path d="M 170 200 L 230 200" class="arrow"/>
+
+  <!-- 2. Discretization + Time Marching -->
+  <rect x="230" y="160" width="220" height="80" rx="10" class="step-box"/>
+  <text x="340" y="195" class="text-main">Discretization +</text>
+  <text x="340" y="215" class="text-main">Time Marching</text>
+
+  <!-- Arrow 2 -->
+  <path d="M 450 200 L 510 200" class="arrow"/>
+
+  <!-- 3. ω(t+Δt) -->
+  <rect x="510" y="160" width="130" height="80" rx="10" class="field-box"/>
+  <text x="575" y="210" class="text-large">ω(t+Δt)</text>
+
+  <!-- Arrow 3 -->
+  <path d="M 640 200 L 700 200" class="arrow"/>
+
+  <!-- 4. Fourier Poisson Solver (HIGHLIGHTED) -->
+  <rect x="680" y="160" width="240" height="80" rx="10" class="step-box" style="stroke-width: 5;"/>
+  <text x="800" y="195" class="text-main">Fourier Poisson Solver</text>
+  <text x="800" y="215" class="text-small">for ψ(t+Δt)</text>
+
+  <!-- Return path -->
+  <!-- Down with arrow from bottom of Poisson Solve box -->
+  <path d="M 800 240 L 800 320" class="arrow"/>
+  <!-- Left -->
+  <path d="M 800 320 L 110 320" class="arrow-plain"/>
+  <!-- Up with arrow -->
+  <path d="M 110 320 L 110 240" class="arrow"/>
+
+  <!-- Label on return arrow -->
+  <text x="500" y="355" class="text-label">Next timestep</text>
+</svg>
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/stencil.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/stencil.png
new file mode 100644
index 00000000..0ba414af
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/stencil.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6919b78abe5a9901632c1edc41ffe4fde1f6e5ac0a93c030ceb8df13bae6e61c
+size 100508
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/structured_vs_unstructured_grid.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/structured_vs_unstructured_grid.png
new file mode 100644
index 00000000..a526daec
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/structured_vs_unstructured_grid.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc6e33de25df8dcd4ffa3e7643d2178fa4fd49f734563144e48416ff97796fd7
+size 5700907
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/turbulence_256x256_Re1000.gif b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/turbulence_256x256_Re1000.gif
new file mode 100644
index 00000000..53773cb3
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/turbulence_256x256_Re1000.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab3dd5dce0f7d0df0bd9a2b261368398b32955f04ab7de74e67238c3107471ee
+size 6712347
diff --git a/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/vorticity_grid_overlay.png b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/vorticity_grid_overlay.png
new file mode 100644
index 00000000..1acbe88c
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/images/chapter-12.2/vorticity_grid_overlay.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b10653049c5288e147f634b341ea284ca706b56371f4ec068ef0dba3095bf78
+size 2695455
diff --git a/Accelerated_Python_User_Guide/notebooks/utils.py b/Accelerated_Python_User_Guide/notebooks/utils.py
new file mode 100644
index 00000000..fa08403b
--- /dev/null
+++ b/Accelerated_Python_User_Guide/notebooks/utils.py
@@ -0,0 +1,449 @@
+# SPDX-License-Identifier: Apache-2.0 AND CC-BY-NC-4.0
+"""Utility functions for validating FFT kernels in the 2D Navier-Stokes solver."""
+
+import numpy as np
+import warp as wp
+import matplotlib.pyplot as plt
+
+
+def validate_fft_roundtrip(
+    fft_kernel,
+    ifft_kernel,
+    n_grid: int,
+    tile_m: int,
+    tile_n: int,
+    block_dim: int,
+    figsize=(14, 4),
+):
+    """Validate FFT and IFFT kernels using a sine wave test.
+
+    Creates a sin(x) function on [0, 2\pi], applies FFT to show the amplitude
+    spectrum (should peak at k=1), then applies IFFT and compares with original.
+
+    Args:
+        fft_kernel: The fft_tiled Warp kernel.
+        ifft_kernel: The ifft_tiled Warp kernel.
+        n_grid: Grid resolution (must match TILE_N used in kernel compilation).
+        tile_m: TILE_M parameter (typically 1 for row-wise FFT).
+        tile_n: TILE_N parameter (typically N_GRID).
+        block_dim: Block dimension for kernel launch.
+        figsize: Figure size tuple.
+
+    Returns:
+        fig, axes: Matplotlib figure and axes.
+        max_error: Maximum absolute error between original and reconstructed.
+    """
+    # Create physical grid on [0, 2\pi)
+    L = 2.0 * np.pi
+    x = np.linspace(0, L, n_grid, endpoint=False)
+
+    # Create test signal: sin(x) which has a peak at k=1
+    # Since f(x) = sin(x) = (e^{ix} - e^{-ix}) / 2i, FFT should show peaks at k=\pm 1
+    signal = np.sin(x)
+
+    # Prepare 2D array (fft_tiled operates on rows of 2D arrays)
+    # We'll use a single row for this 1D test
+    signal_2d = signal.reshape(1, n_grid).astype(np.float32)
+
+    # Convert to complex (wp.vec2f format): [real, imag]
+    # Shape must be (1, n_grid, 2) for Warp to interpret as (1, n_grid) of vec2f
+    signal_complex = np.zeros((1, n_grid, 2), dtype=np.float32)
+    signal_complex[:, :, 0] = signal_2d  # Real part
+    signal_complex[:, :, 1] = 0.0  # Imaginary part = 0
+
+    # Create Warp arrays - pass shape (1, n_grid) and let Warp handle vec2f
+    input_wp = wp.array(signal_complex, dtype=wp.vec2f, shape=(1, n_grid))
+    fft_result = wp.zeros((1, n_grid), dtype=wp.vec2f)
+    ifft_result = wp.zeros((1, n_grid), dtype=wp.vec2f)
+
+    # Perform FFT
+    wp.launch_tiled(
+        fft_kernel,
+        dim=[1, 1],
+        inputs=[input_wp],
+        outputs=[fft_result],
+        block_dim=block_dim,
+    )
+
+    # Perform IFFT
+    wp.launch_tiled(
+        ifft_kernel,
+        dim=[1, 1],
+        inputs=[fft_result],
+        outputs=[ifft_result],
+        block_dim=block_dim,
+    )
+
+    # Synchronize and get results
+    wp.synchronize()
+
+    # Convert back to numpy
+    fft_np = fft_result.numpy()
+    ifft_np = ifft_result.numpy()
+
+    # Extract real and imaginary parts from FFT result
+    fft_real = fft_np[0, :, 0]
+    fft_imag = fft_np[0, :, 1]
+    fft_magnitude = np.sqrt(fft_real**2 + fft_imag**2)
+
+    # IFFT result (need to normalize by N)
+    reconstructed = ifft_np[0, :, 0] / n_grid  # Real part, normalized
+
+    # Wavenumbers for FFT (standard ordering: 0, 1, 2, ..., N/2-1, -N/2, ..., -1)
+    k = np.fft.fftfreq(n_grid, d=L / (2 * np.pi * n_grid))
+
+    # Compute error
+    max_error = np.max(np.abs(signal - reconstructed))
+
+    # Create figure with 3 panels
+    fig, axes = plt.subplots(1, 3, figsize=figsize)
+
+    # Panel 1: Original signal
+    axes[0].plot(x, signal, "b-", linewidth=2, label="sin(x)")
+    axes[0].set_xlabel("x")
+    axes[0].set_ylabel("f(x)")
+    axes[0].set_title("Original Signal: f(x) = sin(x)")
+    axes[0].set_xlim([0, L])
+    axes[0].grid(True, alpha=0.3)
+    axes[0].legend()
+
+    # Panel 2: FFT magnitude spectrum
+    # Shift for better visualization (put k=0 at center)
+    k_shifted = np.fft.fftshift(k)
+    mag_shifted = np.fft.fftshift(fft_magnitude)
+
+    axes[1].stem(k_shifted, mag_shifted, basefmt=" ")
+    axes[1].set_xlabel("Wavenumber k")
+    axes[1].set_ylabel(r"$|\hat{f}(k)|$")
+    axes[1].set_title("FFT Amplitude Spectrum")
+    axes[1].set_xlim([-10, 10])  # Focus on low wavenumbers
+    axes[1].grid(True, alpha=0.3)
+    axes[1].axvline(x=1, color="r", linestyle="--", alpha=0.5, label="k=1")
+    axes[1].axvline(x=-1, color="r", linestyle="--", alpha=0.5, label="k=-1")
+    axes[1].legend()
+
+    # Panel 3: Comparison of original vs reconstructed
+    axes[2].plot(x, signal, "b-", linewidth=2, label="Original")
+    axes[2].plot(x, reconstructed, "r--", linewidth=2, label="FFT→IFFT")
+    axes[2].set_xlabel("x")
+    axes[2].set_ylabel("f(x)")
+    axes[2].set_title(f"Comparison b/w original and reconstructed signal")
+    axes[2].set_xlim([0, L])
+    axes[2].grid(True, alpha=0.3)
+    axes[2].legend()
+
+    plt.tight_layout()
+
+    return fig, axes, max_error
+
+
+def validate_transpose(
+    transpose_kernel,
+    tile_transpose_dim: int,
+    n_test: int = 64,
+    figsize=(10, 4),
+):
+    """Validate the tiled transpose kernel with a visual comparison.
+
+    Creates an upper triangular matrix (1s above diagonal, 0s below),
+    applies transpose, and shows before/after.
+
+    Args:
+        transpose_kernel: The tiled_transpose Warp kernel.
+        tile_transpose_dim: TILE_TRANSPOSE_DIM parameter used in kernel.
+        n_test: Size of the test matrix (default 64x64).
+        figsize: Figure size tuple.
+
+    Returns:
+        fig, axes: Matplotlib figure and axes.
+    """
+    # Create upper triangular matrix: 1s above diagonal, 0s below
+    pattern = np.triu(np.ones((n_test, n_test), dtype=np.float32))
+
+    # Convert to complex format (wp.vec2f) - store pattern in real part
+    input_complex = np.zeros((n_test, n_test, 2), dtype=np.float32)
+    input_complex[:, :, 0] = pattern
+
+    # Create Warp arrays
+    input_wp = wp.array(input_complex, dtype=wp.vec2f, shape=(n_test, n_test))
+    output_wp = wp.zeros((n_test, n_test), dtype=wp.vec2f)
+
+    # Launch transpose kernel
+    wp.launch_tiled(
+        transpose_kernel,
+        dim=(n_test // tile_transpose_dim, n_test // tile_transpose_dim),
+        inputs=[input_wp],
+        outputs=[output_wp],
+        block_dim=tile_transpose_dim * tile_transpose_dim,
+    )
+    wp.synchronize()
+
+    # Get result
+    transposed_result = output_wp.numpy()[:, :, 0]
+
+    # Plot
+    fig, axes = plt.subplots(1, 2, figsize=figsize)
+
+    axes[0].imshow(pattern, cmap="gray_r")
+    axes[0].set_title("Original")
+    axes[0].set_xlabel("j")
+    axes[0].set_ylabel("i")
+
+    axes[1].imshow(transposed_result, cmap="gray_r")
+    axes[1].set_title("After transpose")
+    axes[1].set_xlabel("j")
+    axes[1].set_ylabel("i")
+
+    plt.tight_layout()
+    return fig, axes
+
+
+def validate_poisson_solver(
+    solve_poisson_func,
+    n_grid: int,
+    kx: int = 1,
+    ky: int = 1,
+    figsize=(14, 4),
+):
+    """Validate the Poisson solver using an analytical test case.
+
+    Tests with \omega = (kx^2 + ky^2)sin(kx·x)sin(ky·y), which has the analytical
+    solution psi = sin(kx·x)sin(ky·y) for the Poisson equation \laplacian \psi = -\omega.
+
+    Args:
+        solve_poisson_func: The solve_poisson function to validate.
+        n_grid: Grid resolution.
+        kx: Wavenumber in x-direction (default 1).
+        ky: Wavenumber in y-direction (default 1).
+        figsize: Figure size tuple.
+
+    Returns:
+        fig, axes: Matplotlib figure and axes.
+        max_error: Maximum absolute error between computed and analytical solution.
+    """
+    import warp as wp
+
+    # Create 2D grid on [0, 2\pi) \times [0, 2\pi)
+    L = 2.0 * np.pi
+    x = np.linspace(0, L, n_grid, endpoint=False)
+    y = np.linspace(0, L, n_grid, endpoint=False)
+    X, Y = np.meshgrid(x, y)
+
+    # Analytical solution: \psi = sin(kx·x)sin(ky·y)
+    psi_analytical = np.sin(kx * X) * np.sin(ky * Y)
+
+    # Corresponding vorticity: \omega = (kx^2 + ky^2)sin(kx·x)sin(ky·y)
+    # From \laplacian\psi = -\omega, we have \omega = -\laplacian\psi = (kx^2 + ky^2)\psi
+    k_squared = kx**2 + ky**2
+    omega = k_squared * psi_analytical
+
+    # Precompute 1/|k|^2 for spectral Poisson solver
+    k_freq = np.fft.fftfreq(n_grid, d=1.0 / n_grid)
+    kx_grid, ky_grid = np.meshgrid(k_freq, k_freq)
+    k2 = kx_grid**2 + ky_grid**2
+    k2i_np = np.zeros_like(k2)
+    nonzero = k2 != 0
+    k2i_np[nonzero] = 1.0 / k2[nonzero]
+    k2i = wp.array(k2i_np.astype(np.float32), dtype=wp.float32)
+
+    # Create Warp arrays for omega and psi
+    omega_wp = wp.array(omega.astype(np.float32), dtype=wp.float32)
+    psi_wp = wp.zeros((n_grid, n_grid), dtype=wp.float32)
+
+    # Solve Poisson equation
+    solve_poisson_func(omega_wp, psi_wp, k2i)
+    wp.synchronize()
+
+    # Get computed solution
+    psi_computed = psi_wp.numpy()
+
+    # Compute error
+    error = np.abs(psi_computed - psi_analytical)
+    max_error = np.max(error)
+
+    # Create visualization
+    fig, axes = plt.subplots(1, 3, figsize=figsize)
+
+    # Common colormap settings
+    vmin = min(psi_analytical.min(), psi_computed.min())
+    vmax = max(psi_analytical.max(), psi_computed.max())
+
+    # Panel 1: Computed solution
+    im0 = axes[0].imshow(
+        psi_computed,
+        extent=[0, L, 0, L],
+        origin="lower",
+        cmap="RdBu_r",
+        vmin=vmin,
+        vmax=vmax,
+    )
+    axes[0].set_title(r"Computed $\psi$")
+    axes[0].set_xlabel("x")
+    axes[0].set_ylabel("y")
+    plt.colorbar(im0, ax=axes[0], shrink=0.8)
+
+    # Panel 2: Analytical solution
+    im1 = axes[1].imshow(
+        psi_analytical,
+        extent=[0, L, 0, L],
+        origin="lower",
+        cmap="RdBu_r",
+        vmin=vmin,
+        vmax=vmax,
+    )
+    axes[1].set_title(r"Analytical $\psi = \sin(x)\sin(y)$")
+    axes[1].set_xlabel("x")
+    axes[1].set_ylabel("y")
+    plt.colorbar(im1, ax=axes[1], shrink=0.8)
+
+    # Panel 3: Error
+    im2 = axes[2].imshow(error, extent=[0, L, 0, L], origin="lower", cmap="hot")
+    axes[2].set_title(f"Error")
+    axes[2].set_xlabel("x")
+    axes[2].set_ylabel("y")
+    plt.colorbar(im2, ax=axes[2], shrink=0.8)
+
+    plt.tight_layout()
+    return fig, axes, max_error
+
+
+def validate_advection_diffusion(
+    advection_diffusion_kernel,
+    n_grid: int,
+    kx: int = 2,
+    ky: int = 3,
+    figsize=(10, 4),
+):
+    """Validate finite-difference advection and diffusion using analytical test case.
+
+    Uses test functions:
+        omega = (kx^2 + ky^2) * sin(kx*x) * sin(ky*y)
+        psi = sin(kx*x) * sin(ky*y)
+
+    Analytical results:
+        Diffusion: nabla^2 omega = -(kx^2 + ky^2)^2 * sin(kx*x) * sin(ky*y)
+        Advection: J(psi, omega) = 0 (since omega = c * psi with constant c)
+
+    Args:
+        advection_diffusion_kernel: Warp kernel with signature:
+            (omega, psi, advection_out, diffusion_out, h, n) where all arrays
+            are wp.array2d(dtype=wp.float32), h is wp.float32, n is wp.int32.
+        n_grid: Grid resolution.
+        kx: Wavenumber in x-direction.
+        ky: Wavenumber in y-direction.
+        figsize: Figure size for each 1x2 panel.
+
+    Returns:
+        (fig_diff, axes_diff): Matplotlib figure/axes for diffusion comparison.
+        (fig_adv, axes_adv): Matplotlib figure/axes for advection comparison.
+        max_diff_error: Maximum absolute error in diffusion.
+        max_adv_error: Maximum absolute error in advection.
+    """
+    # Create 2D grid on [0, 2*pi) x [0, 2*pi)
+    L = 2.0 * np.pi
+    h = L / n_grid
+    x = np.linspace(0, L, n_grid, endpoint=False)
+    y = np.linspace(0, L, n_grid, endpoint=False)
+    X, Y = np.meshgrid(x, y)
+
+    # Analytical fields
+    k_squared = kx**2 + ky**2
+    psi_analytical = np.sin(kx * X) * np.sin(ky * Y)
+    omega_analytical = k_squared * psi_analytical
+
+    # Analytical diffusion: nabla^2 omega = -(kx^2 + ky^2) * omega
+    diffusion_analytical = -k_squared * omega_analytical
+
+    # Analytical advection: J(psi, omega) = 0 (since omega = c * psi)
+    advection_analytical = np.zeros_like(omega_analytical)
+
+    # Create Warp arrays
+    omega_wp = wp.array(omega_analytical.astype(np.float32), dtype=wp.float32)
+    psi_wp = wp.array(psi_analytical.astype(np.float32), dtype=wp.float32)
+    advection_wp = wp.zeros((n_grid, n_grid), dtype=wp.float32)
+    diffusion_wp = wp.zeros((n_grid, n_grid), dtype=wp.float32)
+
+    # Launch user-provided kernel
+    wp.launch(
+        advection_diffusion_kernel,
+        dim=(n_grid, n_grid),
+        inputs=[omega_wp, psi_wp, advection_wp, diffusion_wp, float(h), n_grid],
+    )
+    wp.synchronize()
+
+    # Get numerical results
+    diffusion_numerical = diffusion_wp.numpy()
+    advection_numerical = advection_wp.numpy()
+
+    # --- Diffusion comparison plot (1x2 panel) ---
+    fig_diff, axes_diff = plt.subplots(1, 2, figsize=figsize)
+
+    vmin_d = min(diffusion_analytical.min(), diffusion_numerical.min())
+    vmax_d = max(diffusion_analytical.max(), diffusion_numerical.max())
+
+    im0 = axes_diff[0].imshow(
+        diffusion_analytical,
+        extent=[0, L, 0, L],
+        origin="lower",
+        cmap="RdBu_r",
+        vmin=vmin_d,
+        vmax=vmax_d,
+    )
+    axes_diff[0].set_title(r"Analytical $\nabla^2\omega$")
+    axes_diff[0].set_xlabel("x")
+    axes_diff[0].set_ylabel("y")
+    plt.colorbar(im0, ax=axes_diff[0], shrink=0.8)
+
+    im1 = axes_diff[1].imshow(
+        diffusion_numerical,
+        extent=[0, L, 0, L],
+        origin="lower",
+        cmap="RdBu_r",
+        vmin=vmin_d,
+        vmax=vmax_d,
+    )
+    axes_diff[1].set_title(rf"Numerical Diffusion")
+    axes_diff[1].set_xlabel("x")
+    axes_diff[1].set_ylabel("y")
+    plt.colorbar(im1, ax=axes_diff[1], shrink=0.8)
+
+    fig_diff.suptitle(f"Diffusion Validation (kx={kx}, ky={ky}, N={n_grid})", y=1.02)
+    fig_diff.tight_layout()
+
+    # --- Advection comparison plot (1x2 panel) ---
+    fig_adv, axes_adv = plt.subplots(1, 2, figsize=figsize)
+
+    # For advection, analytical is zero, so set symmetric limits around numerical
+    adv_max = max(np.abs(advection_numerical).max(), 1e-10)
+    vmin_a, vmax_a = -adv_max, adv_max
+
+    im2 = axes_adv[0].imshow(
+        advection_analytical,
+        extent=[0, L, 0, L],
+        origin="lower",
+        cmap="RdBu_r",
+        vmin=vmin_a,
+        vmax=vmax_a,
+    )
+    axes_adv[0].set_title(r"Analytical Advection (= 0)")
+    axes_adv[0].set_xlabel("x")
+    axes_adv[0].set_ylabel("y")
+    plt.colorbar(im2, ax=axes_adv[0], shrink=0.8)
+
+    im3 = axes_adv[1].imshow(
+        advection_numerical,
+        extent=[0, L, 0, L],
+        origin="lower",
+        cmap="RdBu_r",
+        vmin=vmin_a,
+        vmax=vmax_a,
+    )
+    axes_adv[1].set_title(rf"Numerical Advection")
+    axes_adv[1].set_xlabel("x")
+    axes_adv[1].set_ylabel("y")
+    plt.colorbar(im3, ax=axes_adv[1], shrink=0.8)
+
+    fig_adv.suptitle(f"Advection Validation (kx={kx}, ky={ky}, N={n_grid})", y=1.02)
+    fig_adv.tight_layout()
+
+    return (fig_diff, axes_diff), (fig_adv, axes_adv)