From f6739bfa00bc6ad8c33b8d4bd4a901cecb871cfe Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Mon, 23 Feb 2026 16:27:54 -0500 Subject: [PATCH] Tests: Normalize notebook metadata and add tests for notebook formats and metadata. --- .../workflows/pr-comment-notebook-format.yml | 100 + .github/workflows/test-notebook-format.yml | 47 + .../Chapter_01_GPU_Computing_Basics.ipynb | 6 + .../Chapter_02_Brief_Intro_to_CUDA.ipynb | 6 + .../Chapter_03_Python_on_the_GPU.ipynb | 6 + ...er_04_Scientific_Computing_with_CuPy.ipynb | 6 + .../Chapter_05_CUDA_Kernels_with_Numba.ipynb | 6 + .../Chapter_06_Intro_to_nvmath-python.ipynb | 10 +- .../notebooks/Chapter_07_Intro_to_cuDF.ipynb | 6 + .../notebooks/Chapter_08_Intro_to_cuML.ipynb | 6 + .../Chapter_09_Intro_to_cuGraph.ipynb | 8 +- .../Chapter_10_Developer_Tools.ipynb | 21 +- ...11_Distributed_Computing_cuPyNumeric.ipynb | 8 +- .../Chapter_12.1_IsingModel_In_Warp.ipynb | 80 +- .../Chapter_12_Intro_to_NVIDIA_Warp.ipynb | 6 +- .../Chapter_cuda.cccl.parallel.ipynb | 22 +- .../notebooks/Chapter_cuda.core.ipynb | 21 +- .../notebooks/Chapter_nvshmem4py.ipynb | 8 +- .../notebooks/Chapter_nvshmem4py_device.ipynb | 22 +- brev/test-notebook-format.py | 398 +++ .../notebooks/distributed/60__mpi4py.ipynb | 7 +- .../notebooks/distributed/61__dask.ipynb | 2 +- .../01__numpy_intro__ndarray_basics.ipynb | 715 ++-- ...y_linear_algebra__svd_reconstruction.ipynb | 17 +- .../03__numpy_to_cupy__ndarray_basics.ipynb | 1011 +++--- ...4__numpy_to_cupy__svd_reconstruction.ipynb | 867 ++--- .../05__memory_spaces__power_iteration.ipynb | 20 +- .../06__asynchrony__power_iteration.ipynb | 859 ++--- ...uda_core__devices_streams_and_memory.ipynb | 10 +- ...umpy_intro__ndarray_basics__SOLUTION.ipynb | 725 ++-- ...lgebra__svd_reconstruction__SOLUTION.ipynb | 17 +- ...py_to_cupy__ndarray_basics__SOLUTION.ipynb | 1180 +++---- ...o_cupy__svd_reconstruction__SOLUTION.ipynb | 7 +- ...ry_spaces__power_iteration__SOLUTION.ipynb | 20 +- 
...synchrony__power_iteration__SOLUTION.ipynb | 17 +- ...devices_streams_and_memory__SOLUTION.ipynb | 6 +- .../kernels/40__kernel_authoring__copy.ipynb | 17 +- ...41__kernel_authoring__book_histogram.ipynb | 15 +- .../42__kernel_authoring__gaussian_blur.ipynb | 2 +- ...3__kernel_authoring__black_and_white.ipynb | 2 +- ...40__kernel_authoring__copy__SOLUTION.ipynb | 2467 +------------- ..._authoring__book_histogram__SOLUTION.ipynb | 2628 +-------------- ...authoring__black_and_white__SOLUTION.ipynb | 2 +- .../20__cudf__nyc_parking_violations.ipynb | 1324 ++++---- ..._cudf_pandas__nyc_parking_violations.ipynb | 12 +- .../notebooks/libraries/22__cuml.ipynb | 2 +- ...3__cuda_cccl__customizing_algorithms.ipynb | 2940 +++++++++-------- .../24__nvmath_python__interop.ipynb | 12 +- .../25__nvmath_python__kernel_fusion.ipynb | 11 +- .../26__nvmath_python__stateful_apis.ipynb | 11 +- .../27__nvmath_python__scaling.ipynb | 11 +- .../notebooks/libraries/28__pynvml.ipynb | 2 +- ...df__nyc_parking_violations__SOLUTION.ipynb | 1364 ++++---- ...as__nyc_parking_violations__SOLUTION.ipynb | 12 +- ...cl__customizing_algorithms__SOLUTION.ipynb | 6 +- .../accelerated-python/notebooks/start.ipynb | 15 +- ...cuda_python__cuda_core_cccl__2_hours.ipynb | 15 +- ...hon__cupy_cudf_cccl_kernels__8_hours.ipynb | 15 +- .../pyhpc__numpy_cupy_mpi4py__4_hours.ipynb | 15 +- .../01.01.01-CUDA-Made-Easy.ipynb | 4 +- .../01.02.01-Execution-Spaces.ipynb | 6 +- ...2-Exercise-Annotate-Execution-Spaces.ipynb | 6 +- ...03-Exercise-Changing-Execution-Space.ipynb | 6 +- ...-Exercise-Compute-Median-Temperature.ipynb | 6 +- .../01.03.01-Extending-Algorithms.ipynb | 4 +- ...01.03.02-Exercise-Computing-Variance.ipynb | 6 +- .../01.04.01-Vocabulary-Types.ipynb | 6 +- .../01.04.02-Exercise-mdspan.ipynb | 6 +- .../01.05.01-Serial-vs-Parallel.ipynb | 6 +- ...-Exercise-Segmented-Sum-Optimization.ipynb | 6 +- .../01.05.03-Exercise-Segmented-Mean.ipynb | 6 +- .../01.06.01-Memory-Spaces.ipynb | 6 +- 
.../01.06.02-Exercise-Copy.ipynb | 6 +- .../01.07-Summary/01.07.01-Summary.ipynb | 4 +- .../01.08-Advanced/01.08.01-Advanced.ipynb | 4 +- .../02.01.01-Introduction.ipynb | 4 +- .../02.02.01-Asynchrony.ipynb | 6 +- ...02.02.02-Exercise-Compute-IO-Overlap.ipynb | 6 +- .../02.02.03-Exercise-Nsight.ipynb | 6 +- .../02.02.04-Exercise-NVTX.ipynb | 6 +- .../02.03-Streams/02.03.01-Streams.ipynb | 4 +- .../02.03.02-Exercise-Async-Copy.ipynb | 6 +- .../02.04-Pinned-Memory/02.04.01-Pinned.ipynb | 4 +- .../02.04.02-Exercise-Copy-Overlap.ipynb | 6 +- .../03.01-Introduction.ipynb | 4 +- .../03.02-Kernels/03.02.01-Kernels.ipynb | 6 +- .../03.02.02-Exercise-Symmetry.ipynb | 4 +- .../03.02.03-Exercise-Row-Symmetry.ipynb | 4 +- .../03.02-Kernels/03.02.04-Dev-Tools.ipynb | 4 +- .../03.03-Atomics/03.03.01-Histogram.ipynb | 6 +- .../03.03.02-Exercise-Fix-Histogram.ipynb | 4 +- .../03.04-Synchronization/03.04.01-Sync.ipynb | 6 +- .../03.04.02-Exercise-Histogram.ipynb | 4 +- .../03.05-Shared-Memory/03.05.01-Shared.ipynb | 4 +- ...03.05.02-Exercise-Optimize-Histogram.ipynb | 4 +- .../03.06.01-Cooperative.ipynb | 4 +- ...06.02-Exercise-Cooperative-Histogram.ipynb | 4 +- .../01__cutile_python_intro__vector_add.ipynb | 15 +- tutorials/cuda-tile/notebooks/start.ipynb | 15 +- .../notebooks/01_kernel_fusion.ipynb | 8 +- .../notebooks/01_kernel_fusion_SOLUTION.ipynb | 10 +- .../notebooks/02_mem_exec_spaces.ipynb | 8 +- .../02_mem_exec_spaces_SOLUTION.ipynb | 10 +- .../notebooks/03_stateful_api.ipynb | 8 +- .../notebooks/03_stateful_api_SOLUTION.ipynb | 10 +- .../notebooks/04_callbacks.ipynb | 8 +- .../notebooks/04_callbacks_SOLUTION.ipynb | 10 +- .../notebooks/05_device_api.ipynb | 8 +- .../notebooks/05_device_api_SOLUTION.ipynb | 10 +- .../notebooks/06_sparse_solver.ipynb | 10 +- .../notebooks/06_sparse_solver_SOLUTION.ipynb | 10 +- .../notebooks/cpp/lab1_daxpy/daxpy.ipynb | 10 +- .../notebooks/cpp/lab1_select/select.ipynb | 10 +- .../stdpar/notebooks/cpp/lab2_heat/heat.ipynb | 10 +- 
.../stdpar/notebooks/cpp/lab3_tree/tree.ipynb | 10 +- tutorials/stdpar/notebooks/cpp/start.ipynb | 10 +- .../fortran/lab1_matmul/matmul.ipynb | 10 +- .../notebooks/fortran/lab2_daxpy/daxpy.ipynb | 10 +- .../notebooks/fortran/lab3_heat/heat.ipynb | 10 +- .../stdpar/notebooks/fortran/start.ipynb | 10 +- tutorials/stdpar/notebooks/start.ipynb | 10 +- 121 files changed, 6790 insertions(+), 10817 deletions(-) create mode 100644 .github/workflows/pr-comment-notebook-format.yml create mode 100644 .github/workflows/test-notebook-format.yml create mode 100755 brev/test-notebook-format.py diff --git a/.github/workflows/pr-comment-notebook-format.yml b/.github/workflows/pr-comment-notebook-format.yml new file mode 100644 index 00000000..23bf2bf3 --- /dev/null +++ b/.github/workflows/pr-comment-notebook-format.yml @@ -0,0 +1,100 @@ +name: PR Comment on Notebook Format Check + +on: + workflow_run: + workflows: ["Test Notebook Format"] + types: [completed] + +# Write permissions for commenting +permissions: + pull-requests: write + +jobs: + comment: + runs-on: ubuntu-latest + if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' + + steps: + - name: Download PR comment data + uses: actions/download-artifact@v4 + with: + name: pr-comment-data + github-token: ${{ secrets.GITHUB_TOKEN }} + run-id: ${{ github.event.workflow_run.id }} + + - name: Post comment on PR + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + + // Read PR number from artifact + const prNumber = parseInt(fs.readFileSync('pr_number', 'utf8').trim()); + + if (!prNumber || isNaN(prNumber)) { + console.log('No valid PR number found, skipping comment.'); + return; + } + + const runUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${{ github.event.workflow_run.id }}`; + + const commentBody = `## ❌ Notebook Format Check Failed + + **One or more Jupyter notebooks have format or metadata issues.** + + 
Please check the [workflow run logs](${runUrl}) for details on which notebooks have issues. + + ### What is checked: + + 1. **Schema integrity** — notebooks must be valid according to the Jupyter notebook JSON schema + 2. **Metadata conformance** — notebooks must have the standard metadata block (accelerator, colab, kernelspec, language_info) + 3. **Clean outputs** — non-SOLUTION notebooks must have outputs, execution counts, and execution timing metadata cleared + + ### How to fix: + + \`\`\`bash + # Check all tutorials + python3 brev/test-notebook-format.py + + # Check a specific tutorial + python3 brev/test-notebook-format.py + + # Auto-fix all issues + python3 brev/test-notebook-format.py --fix + + # Auto-fix a specific tutorial + python3 brev/test-notebook-format.py --fix + \`\`\` + `; + + // Check if we already commented on this PR to avoid spam + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber + }); + + const botComment = comments.find(comment => + comment.user.type === 'Bot' && + comment.body.includes('Notebook Format Check Failed') + ); + + if (botComment) { + // Update existing comment + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: botComment.id, + body: commentBody + }); + console.log(`Updated existing comment on PR #${prNumber}`); + } else { + // Create new comment + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body: commentBody + }); + console.log(`Created comment on PR #${prNumber}`); + } diff --git a/.github/workflows/test-notebook-format.yml b/.github/workflows/test-notebook-format.yml new file mode 100644 index 00000000..1574d65e --- /dev/null +++ b/.github/workflows/test-notebook-format.yml @@ -0,0 +1,47 @@ +name: Test Notebook Format + +on: + push: + branches: + - '**' + pull_request: + types: [opened, 
reopened, synchronize] + +# Minimal permissions - only read access needed for checks +permissions: + contents: read + +jobs: + test-notebook-format: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Install nbformat + run: pip install nbformat + + - name: Check notebook format + id: notebook-format-check + run: python3 brev/test-notebook-format.py + + - name: Save PR number and result + if: always() && github.event_name == 'pull_request' + run: | + mkdir -p ./pr-comment-data + echo '${{ github.event.pull_request.number }}' > ./pr-comment-data/pr_number + echo '${{ steps.notebook-format-check.outcome }}' > ./pr-comment-data/outcome + + - name: Upload PR comment data + if: always() && github.event_name == 'pull_request' + uses: actions/upload-artifact@v4 + with: + name: pr-comment-data + path: pr-comment-data/ + retention-days: 1 diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_01_GPU_Computing_Basics.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_01_GPU_Computing_Basics.ipynb index 01fb5faf..5ee52a6d 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_01_GPU_Computing_Basics.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_01_GPU_Computing_Basics.ipynb @@ -222,6 +222,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_02_Brief_Intro_to_CUDA.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_02_Brief_Intro_to_CUDA.ipynb index ce1d8769..8ee19977 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_02_Brief_Intro_to_CUDA.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_02_Brief_Intro_to_CUDA.ipynb @@ -124,6 +124,12 @@ } ], "metadata": { + 
"accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_03_Python_on_the_GPU.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_03_Python_on_the_GPU.ipynb index 107c0f07..7c592f3a 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_03_Python_on_the_GPU.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_03_Python_on_the_GPU.ipynb @@ -85,6 +85,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_04_Scientific_Computing_with_CuPy.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_04_Scientific_Computing_with_CuPy.ipynb index 8e842c40..ace77f5f 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_04_Scientific_Computing_with_CuPy.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_04_Scientific_Computing_with_CuPy.ipynb @@ -360,6 +360,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_05_CUDA_Kernels_with_Numba.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_05_CUDA_Kernels_with_Numba.ipynb index 8d5f6659..bf93d9c1 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_05_CUDA_Kernels_with_Numba.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_05_CUDA_Kernels_with_Numba.ipynb @@ -306,6 +306,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff 
--git a/Accelerated_Python_User_Guide/notebooks/Chapter_06_Intro_to_nvmath-python.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_06_Intro_to_nvmath-python.ipynb index aabb1503..ec56d208 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_06_Intro_to_nvmath-python.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_06_Intro_to_nvmath-python.ipynb @@ -492,8 +492,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -507,7 +513,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_07_Intro_to_cuDF.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_07_Intro_to_cuDF.ipynb index 668585f7..43f9103b 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_07_Intro_to_cuDF.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_07_Intro_to_cuDF.ipynb @@ -335,6 +335,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_08_Intro_to_cuML.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_08_Intro_to_cuML.ipynb index 4e355051..9beb6b6b 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_08_Intro_to_cuML.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_08_Intro_to_cuML.ipynb @@ -436,6 +436,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git 
a/Accelerated_Python_User_Guide/notebooks/Chapter_09_Intro_to_cuGraph.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_09_Intro_to_cuGraph.ipynb index 580b8fa7..49e23ae8 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_09_Intro_to_cuGraph.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_09_Intro_to_cuGraph.ipynb @@ -595,6 +595,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -614,5 +620,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_10_Developer_Tools.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_10_Developer_Tools.ipynb index c5c18bf6..43c869e0 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_10_Developer_Tools.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_10_Developer_Tools.ipynb @@ -208,21 +208,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "629cb5c6-3c72-4658-b3cb-ea9352c89ee6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "init step...\n", - "sort step...\n", - "dot step...\n", - "done\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "import cupy as cp\n", @@ -766,6 +755,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_11_Distributed_Computing_cuPyNumeric.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_11_Distributed_Computing_cuPyNumeric.ipynb index 33a2c0ae..8e992de5 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_11_Distributed_Computing_cuPyNumeric.ipynb +++ 
b/Accelerated_Python_User_Guide/notebooks/Chapter_11_Distributed_Computing_cuPyNumeric.ipynb @@ -1516,6 +1516,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -1531,7 +1537,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_12.1_IsingModel_In_Warp.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_12.1_IsingModel_In_Warp.ipynb index b090ffb8..f0cfe97d 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_12.1_IsingModel_In_Warp.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_12.1_IsingModel_In_Warp.ipynb @@ -96,7 +96,7 @@ "\n", "# Check for GPU availability\n", "if wp.get_cuda_device_count() > 0:\n", - " print(\"\u2713 GPU detected successfully\")\n", + " print(\"✓ GPU detected successfully\")\n", "else:\n", " print(\"No GPU detected!\")" ] @@ -112,7 +112,7 @@ "\n", "Now, we will apply these concepts to implement the 2-D Ising model.\n", "\n", - "The Ising model is particularly instructive because it highlights both the power and challenges of GPU parallelization. While the algorithm seems embarrassingly parallel at first glance (each lattice site can be updated independently), the devil is in the details \u2014 neighboring sites interact, creating potential race conditions that must be carefully managed.\n", + "The Ising model is particularly instructive because it highlights both the power and challenges of GPU parallelization. 
While the algorithm seems embarrassingly parallel at first glance (each lattice site can be updated independently), the devil is in the details — neighboring sites interact, creating potential race conditions that must be carefully managed.\n", "\n", "Let us begin by understanding the physics behind the Ising model, then progressively build our GPU implementation, learning from common pitfalls along the way.\n" ] @@ -130,7 +130,7 @@ "\n", "### Numerical setup\n", "\n", - "Imagine a two-dimensional grid where each site contains a **magnetic spin** that can point either \"up\" (\u2191) or \"down\" (\u2193). In mathematical terms, each spin $\\sigma_i$ takes the value $+1$ (up) or $-1$ (down). This seemingly simple setup gives rise to rich dynamics as temperature changes:\n", + "Imagine a two-dimensional grid where each site contains a **magnetic spin** that can point either \"up\" (↑) or \"down\" (↓). In mathematical terms, each spin $\\sigma_i$ takes the value $+1$ (up) or $-1$ (down). This seemingly simple setup gives rise to rich dynamics as temperature changes:\n", "\n", "- At **low temperatures**: Spins tend to align with their neighbors, creating coherent regions\n", "- At **high temperatures**: Thermal fluctuations dominate, leading to random spins and disordered regions\n", @@ -245,15 +245,15 @@ " \"\"\"\n", " Initialize a square lattice with random spin orientations.\n", "\n", - " Creates an L\u00d7L grid where each site contains a magnetic spin that can be\n", + " Creates an L×L grid where each site contains a magnetic spin that can be\n", " either +1 (spin up) or -1 (spin down). 
The initial configuration is\n", " random.\n", "\n", " Args:\n", - " L (int): Linear size of the lattice (creates L\u00d7L grid)\n", + " L (int): Linear size of the lattice (creates L×L grid)\n", "\n", " Returns:\n", - " list[list[int]]: 2-D lattice where lattice[i][j] \u2208 {-1, +1}\n", + " list[list[int]]: 2-D lattice where lattice[i][j] ∈ {-1, +1}\n", " \"\"\"\n", " lattice = [[random.choice([-1, 1]) for _ in range(L)] for _ in range(L)]\n", " return lattice\n", @@ -267,7 +267,7 @@ " lattice exactly once. We visit sites in random order to avoid\n", " systematic biases that could arise from sequential scanning.\n", "\n", - " For each site, we calculate the energy change \u0394E that would result from\n", + " For each site, we calculate the energy change ΔE that would result from\n", " flipping the spin, then accept or reject the flip based on the Boltzmann\n", " probability\n", "\n", @@ -279,22 +279,22 @@ " Args:\n", " lattice (list[list[int]]): Current spin configuration to update in-place\n", " L (int): lattice size in terms of number of grid points along one dimension\n", - " T (float): Temperature, T_critical \u2248 2.269 for 2-D Ising model when k_B = J = 1\n", + " T (float): Temperature, T_critical ≈ 2.269 for 2-D Ising model when k_B = J = 1\n", "\n", " Algorithm Details:\n", " 1. Create list of all (i,j) lattice coordinates\n", " 2. Randomly shuffle the update order to avoid artifacts\n", " 3. For each site (i,j):\n", " a. Calculate sum of 4 nearest neighbors using periodic boundaries\n", - " b. Compute energy change: \u0394E = 2 * J * \u03c3_ij * \u03a3_neighbors\n", - " c. Accept flip with probability P = exp(-\u0394E/T)\n", + " b. Compute energy change: ΔE = 2 * J * σ_ij * Σ_neighbors\n", + " c. Accept flip with probability P = exp(-ΔE/T)\n", " d. 
Update lattice[i][j] *= -1 if flip accepted\n", "\n", " Note:\n", - " The acceptance ratio simplifies to exp(-2 * \u03b2 * \u03c3_ij * neighbor_sum)\n", + " The acceptance ratio simplifies to exp(-2 * β * σ_ij * neighbor_sum)\n", " because we only consider the energy difference, not absolute energy.\n", " \"\"\"\n", - " # Convert temperature to inverse temperature (\u03b2 = 1/T)\n", + " # Convert temperature to inverse temperature (β = 1/T)\n", " # kB = 1\n", " beta = 1.0 / T\n", "\n", @@ -317,9 +317,9 @@ " + lattice[i][(j + 1) % L] # neighbor right\n", " )\n", "\n", - " # Energy change from flipping spin: \u0394E = -J * (\u03c3_new - \u03c3_old) * \u03a3_neighbors\n", - " # Since \u03c3_new = -\u03c3_old, we get: \u0394E = 2 * J * \u03c3_ij * \u03a3_neighbors\n", - " # Acceptance probability: P = exp(-\u03b2 * \u0394E) = exp(-\u03b2 * 2 * \u03c3_ij * \u03a3_neighbors)\n", + " # Energy change from flipping spin: ΔE = -J * (σ_new - σ_old) * Σ_neighbors\n", + " # Since σ_new = -σ_old, we get: ΔE = 2 * J * σ_ij * Σ_neighbors\n", + " # Acceptance probability: P = exp(-β * ΔE) = exp(-β * 2 * σ_ij * Σ_neighbors)\n", " acceptance_ratio = math.exp(-beta * 2 * spin_ij * nn_sum)\n", "\n", " # Accept flip if random number < acceptance probability\n", @@ -357,9 +357,9 @@ "LATTICE_SIZE = 256\n", "TEMPERATURE = 2.269 # Try: T=0.02 (ordered), T=2.269 (critical), T=20.0 (disordered)\n", "\n", - "print(f\"Simulating {LATTICE_SIZE}\u00d7{LATTICE_SIZE} Ising model at T={TEMPERATURE}\")\n", + "print(f\"Simulating {LATTICE_SIZE}×{LATTICE_SIZE} Ising model at T={TEMPERATURE}\")\n", "print(\n", - " f\"Critical temperature T_c \u2248 2.269 (this run: {'below' if TEMPERATURE < 2.269 else 'above'} T_c)\"\n", + " f\"Critical temperature T_c ≈ 2.269 (this run: {'below' if TEMPERATURE < 2.269 else 'above'} T_c)\"\n", ")\n", "\n", "# Initialize with random spins (mimics infinite temperature initial condition)\n", @@ -441,7 +441,7 @@ "TEMPERATURE = 20.0 # Try: T=0.02 (ordered), T=2.269 
(critical), T=20.0 (disordered)\n", "\n", "print(\n", - " f\"Magnetization analysis: {LATTICE_SIZE}\u00d7{LATTICE_SIZE} lattice at T={TEMPERATURE}\"\n", + " f\"Magnetization analysis: {LATTICE_SIZE}×{LATTICE_SIZE} lattice at T={TEMPERATURE}\"\n", ")\n", "\n", "# Initialize with random configuration\n", @@ -461,14 +461,14 @@ " L (int): Linear lattice size\n", "\n", " Returns:\n", - " float: Normalized magnetization M \u2208 [-1, +1]\n", + " float: Normalized magnetization M ∈ [-1, +1]\n", " M = +1: All spins up (perfect ferromagnetic order)\n", " M = -1: All spins down (perfect ferromagnetic order)\n", " M = 0: Equal numbers of up/down spins (disordered)\n", "\n", " Physics Notes:\n", - " - M is the thermal average \u27e8\u03a3\u1d62 \u03c3\u1d62\u27e9 / N in equilibrium\n", - " - |M| \u2192 0 as T \u2192 T_c from below (continuous phase transition)\n", + " - M is the thermal average ⟨Σᵢ σᵢ⟩ / N in equilibrium\n", + " - |M| → 0 as T → T_c from below (continuous phase transition)\n", " - M fluctuates around its equilibrium value due to thermal noise\n", " \"\"\"\n", " total_spin = sum(sum(row) for row in lattice)\n", @@ -604,11 +604,11 @@ "\n", " Kernel launch:\n", " wp.launch(generate_lattice, dim=(LATTICE_SIZE, LATTICE_SIZE), ...)\n", - " Creates LATTICE_SIZE\u00b2 threads, one per lattice site\n", + " Creates LATTICE_SIZE² threads, one per lattice site\n", " \"\"\"\n", "\n", " # Get this thread's 2-D coordinates within the lattice\n", - " # wp.tid() returns (i, j) where i\u2208[0,LATTICE_SIZE), j\u2208[0,LATTICE_SIZE)\n", + " # wp.tid() returns (i, j) where i∈[0,LATTICE_SIZE), j∈[0,LATTICE_SIZE)\n", " i, j = wp.tid()\n", "\n", " # Initialize random number generator state for this specific thread\n", @@ -782,9 +782,9 @@ "\n", "wp.launch(generate_lattice, lattice_0.shape, inputs=[lattice_0, 42])\n", "\n", - "print(f\"Simulating {LATTICE_SIZE}\u00d7{LATTICE_SIZE} Ising model at T={TEMPERATURE}\")\n", + "print(f\"Simulating {LATTICE_SIZE}×{LATTICE_SIZE} Ising model at 
T={TEMPERATURE}\")\n", "print(\n", - " f\"Critical temperature T_c \u2248 2.269 (this run: {'below' if TEMPERATURE < 2.269 else 'above'} T_c)\"\n", + " f\"Critical temperature T_c ≈ 2.269 (this run: {'below' if TEMPERATURE < 2.269 else 'above'} T_c)\"\n", ")\n", "\n", "# Set up visualization colormap\n", @@ -849,12 +849,12 @@ "\n", "Running the same lattice size and number of Monte Carlo steps as in the baseline Python implementation reveals significant performance improvements:\n", "\n", - "- **CPU implementation**: ~5.0 seconds for 200 Monte Carlo steps on a 256\u00d7256 lattice\n", + "- **CPU implementation**: ~5.0 seconds for 200 Monte Carlo steps on a 256×256 lattice\n", "- **GPU implementation**: ~0.2 seconds for the same workload\n", "\n", "This represents a **25x speedup** - a significant improvement that demonstrates the power of GPU parallelization for this simulation. The acceleration comes from processing all 65,536 lattice sites in parallel rather than sequentially.\n", "\n", - "However, the results are not correct! At the low temperature $T = 0.02$, the 2-D Ising model should exhibit strong ordering - we should see large domains of aligned spins with well-defined boundaries. Even at high temperatures, the results appear incorrect and differ significantly from our baseline Python implementation. This highlights the importance of maintaining a reference implementation when developing complex algorithms \u2014 it provides a reliable benchmark for validating correctness.\n", + "However, the results are not correct! At the low temperature $T = 0.02$, the 2-D Ising model should exhibit strong ordering - we should see large domains of aligned spins with well-defined boundaries. Even at high temperatures, the results appear incorrect and differ significantly from our baseline Python implementation. 
This highlights the importance of maintaining a reference implementation when developing complex algorithms — it provides a reliable benchmark for validating correctness.\n", "\n", "The lattice configuration appears unusually fragmented and disordered. This unexpected behavior suggests that our seemingly straightforward parallelization approach may have introduced subtle but significant algorithmic issues.\n", "\n", @@ -1112,7 +1112,7 @@ "source": [ "### Checkerboard neighbor indexing logic\n", "\n", - "When we decompose our $N\u00d7N$ lattice into two $N\u00d7(N/2)$ sublattices (black and white), finding horizontal neighbors becomes a bit more complex. Finding top and bottom neighbors is still relatively straightforward (we just need to keep wrapping around the corners for periodic boundary conditions).\n", + "When we decompose our $N×N$ lattice into two $N×(N/2)$ sublattices (black and white), finding horizontal neighbors becomes a bit more complex. Finding top and bottom neighbors is still relatively straightforward (we just need to keep wrapping around the corners for periodic boundary conditions).\n", "\n", "Each cell in a decomposed sublattice has **two horizontal neighbors** in the opposite colored sublattice:\n", "- A **left neighbor** \n", @@ -1257,11 +1257,11 @@ "source": [ "@wp.kernel\n", "def update_lattice(\n", - " beta: float, # Inverse temperature parameter (\u03b2 = 1/kT)\n", + " beta: float, # Inverse temperature parameter (β = 1/kT)\n", " rng_seed: int, # Random seed for Monte Carlo sampling\n", " is_black: bool, # True if updating black sublattice, False if updating white sublattice\n", - " op_lattice: wp.array2d(dtype=wp.int8), # Opposite color sublattice (N \u00d7 N/2)\n", - " lattice: wp.array2d(dtype=wp.int8), # Current color sublattice being updated (N \u00d7 N/2)\n", + " op_lattice: wp.array2d(dtype=wp.int8), # Opposite color sublattice (N × N/2)\n", + " lattice: wp.array2d(dtype=wp.int8), # Current color sublattice being updated (N × 
N/2)\n", "):\n", "\n", " # Get thread coordinates in the current sublattice\n", @@ -1357,9 +1357,9 @@ "lattice_w = wp.empty_like(lattice_b)\n", "combined_lattice = wp.empty((LATTICE_SIZE, LATTICE_SIZE), dtype=wp.int8)\n", "\n", - "print(f\"Simulating {LATTICE_SIZE}\u00d7{LATTICE_SIZE} Ising model at T={TEMPERATURE}\")\n", + "print(f\"Simulating {LATTICE_SIZE}×{LATTICE_SIZE} Ising model at T={TEMPERATURE}\")\n", "print(\n", - " f\"Critical temperature T_c \u2248 2.269 (this run: {'below' if TEMPERATURE < 2.269 else 'above'} T_c)\"\n", + " f\"Critical temperature T_c ≈ 2.269 (this run: {'below' if TEMPERATURE < 2.269 else 'above'} T_c)\"\n", ")\n", "\n", "\n", @@ -1615,7 +1615,7 @@ " fmt=\"o--\",\n", " capsize=5,\n", " capthick=2,\n", - " label=\"Simulation \u00b1 \u03c3\",\n", + " label=\"Simulation ± σ\",\n", " color=\"#76b900\",\n", ")\n", "\n", @@ -1677,11 +1677,11 @@ "colab": { "gpuType": "T4", "provenance": [], - "toc_visible": true, - "include_colab_link": true + "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" }, "language_info": { @@ -1694,9 +1694,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 5 +} diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_12_Intro_to_NVIDIA_Warp.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_12_Intro_to_NVIDIA_Warp.ipynb index 973d36a4..7132a4b1 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_12_Intro_to_NVIDIA_Warp.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_12_Intro_to_NVIDIA_Warp.ipynb @@ -1474,7 +1474,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1488,9 +1488,9 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.cccl.parallel.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.cccl.parallel.ipynb index 255d681f..9579e75f 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.cccl.parallel.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.cccl.parallel.ipynb @@ -2999,8 +2999,28 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.core.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.core.ipynb index a37c19ec..7bc0c33c 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.core.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_cuda.core.ipynb @@ -67,16 +67,7 @@ "execution_count": null, "id": "a0771178", "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (2111655890.py, line 1)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[1], line 1\u001b[0;36m\u001b[0m\n\u001b[0;31m python3 -m pip install cuda-python numpy\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], + "outputs": [], "source": [ "pip install cuda-core numpy" ] @@ -1691,8 +1682,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", 
+ "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1706,7 +1703,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py.ipynb index 1c73e659..d4745bd7 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py.ipynb @@ -556,6 +556,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -571,7 +577,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py_device.ipynb b/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py_device.ipynb index 5e56eb8f..53a6075b 100644 --- a/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py_device.ipynb +++ b/Accelerated_Python_User_Guide/notebooks/Chapter_nvshmem4py_device.ipynb @@ -359,8 +359,28 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/brev/test-notebook-format.py 
b/brev/test-notebook-format.py
new file mode 100755
index 00000000..5300e334
--- /dev/null
+++ b/brev/test-notebook-format.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python3
+"""
+Test Jupyter notebook format integrity and metadata.
+
+This script performs three checks on every notebook:
+  1. Structural integrity: validates the notebook against the official Jupyter
+     notebook JSON schema using nbformat.
+  2. Metadata conformance: verifies that the top-level metadata, nbformat, and
+     nbformat_minor fields match the expected values.
+  3. Clean outputs: non-SOLUTION notebooks must have all cell outputs,
+     execution counts, and execution timing metadata cleared.
+
+The cuDF kernelspec is accepted as an alternative to the default ipykernel.
+If a notebook has any other kernelspec (or none), it is treated as incorrect
+and --fix will replace it with the default.
+
+Usage:
+    ./brev/test-notebook-format.py                  # check all tutorials
+    ./brev/test-notebook-format.py TUTORIAL         # check one tutorial
+    ./brev/test-notebook-format.py TUTORIAL --fix   # check and fix one tutorial
+    ./brev/test-notebook-format.py --fix            # check and fix all tutorials
+
+Examples:
+    ./brev/test-notebook-format.py
+    ./brev/test-notebook-format.py accelerated-python
+    ./brev/test-notebook-format.py cuda-cpp --fix
+"""
+
+import argparse
+import json
+import sys
+import warnings
+from pathlib import Path
+
+import nbformat
+
+# Standard metadata expected for all notebooks.
+STANDARD_METADATA = { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": True, + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3", + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3, + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7", + }, +} + +STANDARD_NBFORMAT = 4 +STANDARD_NBFORMAT_MINOR = 5 + +# The cuDF kernelspec is accepted as an alternative to the default. +CUDF_KERNELSPEC = { + "display_name": "Python 3 (RAPIDS 25.10)", + "language": "python", + "name": "cudf-cu13-25.10", +} + +# ANSI colors +RED = "\033[0;31m" +GREEN = "\033[0;32m" +YELLOW = "\033[1;33m" +NC = "\033[0m" # No Color + + +def has_cudf_kernelspec(metadata: dict) -> bool: + """Check if a notebook's metadata contains the cuDF kernelspec.""" + return metadata.get("kernelspec") == CUDF_KERNELSPEC + + +def get_expected_metadata(metadata: dict) -> dict: + """ + Return the expected metadata dict for a notebook. + + If the notebook already has the cuDF kernelspec, it is preserved. + Otherwise the default kernelspec is expected. + """ + expected = dict(STANDARD_METADATA) + if has_cudf_kernelspec(metadata): + expected = dict(expected) + expected["kernelspec"] = dict(CUDF_KERNELSPEC) + return expected + + +def diff_metadata(actual: dict, expected: dict, path: str = "") -> list[str]: + """ + Recursively compare actual metadata against expected metadata. + + Returns a list of human-readable difference descriptions. + """ + diffs = [] + prefix = f"{path}." 
if path else "" + + # Check for missing keys + for key in expected: + if key not in actual: + diffs.append(f" Missing key: {prefix}{key}") + elif isinstance(expected[key], dict) and isinstance(actual[key], dict): + diffs.extend(diff_metadata(actual[key], expected[key], f"{prefix}{key}")) + elif actual[key] != expected[key]: + diffs.append( + f" Wrong value for {prefix}{key}: " + f"got {json.dumps(actual[key])}, " + f"expected {json.dumps(expected[key])}" + ) + + # Check for extra keys + for key in actual: + if key not in expected: + diffs.append(f" Extra key: {prefix}{key}") + + return diffs + + +def is_solution_notebook(notebook_path: Path) -> bool: + """Check if a notebook is a SOLUTION notebook (filename contains SOLUTION).""" + return "SOLUTION" in notebook_path.name + + +def check_clean_outputs(notebook: dict) -> list[str]: + """ + Check that code cells have no outputs, execution counts, or execution + timing metadata (from the jupyterlab-execute-time plugin). + + Returns a list of problem descriptions (empty if clean). + """ + problems = [] + for i, cell in enumerate(notebook.get("cells", [])): + if cell.get("cell_type") != "code": + continue + if cell.get("outputs"): + problems.append(f" Cell {i} has non-empty outputs") + if cell.get("execution_count") is not None: + problems.append(f" Cell {i} has execution_count={cell['execution_count']}") + if "execution" in cell.get("metadata", {}): + problems.append(f" Cell {i} has execution timing metadata") + return problems + + +def strip_outputs(notebook: dict) -> None: + """Clear outputs, execution counts, and execution timing metadata from all + code cells in-place.""" + for cell in notebook.get("cells", []): + if cell.get("cell_type") != "code": + continue + cell["outputs"] = [] + cell["execution_count"] = None + cell.get("metadata", {}).pop("execution", None) + + +def validate_notebook_schema(notebook_path: Path) -> list[str]: + """ + Validate a notebook against the official Jupyter notebook JSON schema. 
+ + Returns a list of validation error messages (empty if valid). + """ + errors = [] + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + nb = nbformat.read(str(notebook_path), as_version=4) + nbformat.validate(nb) + except nbformat.ValidationError as e: + errors.append(f" Schema validation error: {e.message}") + except Exception as e: + errors.append(f" Failed to read notebook: {e}") + return errors + + +def check_notebook(notebook_path: Path, fix: bool) -> bool: + """ + Check a single notebook's format and metadata. + + Returns True if the notebook passes all checks. + If fix=True, corrects the metadata in-place. + """ + # Phase 1: Validate notebook structure against the JSON schema. + schema_errors = validate_notebook_schema(notebook_path) + + # Phase 2: Check metadata conformance. + try: + with open(notebook_path, "r", encoding="utf-8") as f: + notebook = json.load(f) + except (json.JSONDecodeError, OSError) as e: + print(f"{RED}✗{NC} {notebook_path}") + print(f" Failed to read notebook: {e}") + return False + + actual_metadata = notebook.get("metadata", {}) + expected_metadata = get_expected_metadata(actual_metadata) + actual_nbformat = notebook.get("nbformat") + actual_nbformat_minor = notebook.get("nbformat_minor") + + tag = "cudf" if has_cudf_kernelspec(actual_metadata) else "standard" + + problems = [] + + # Add schema errors + problems.extend(schema_errors) + + # Check metadata + metadata_diffs = diff_metadata(actual_metadata, expected_metadata, "metadata") + if metadata_diffs: + problems.extend(metadata_diffs) + + # Check nbformat + if actual_nbformat != STANDARD_NBFORMAT: + problems.append( + f" Wrong nbformat: got {actual_nbformat}, expected {STANDARD_NBFORMAT}" + ) + + # Check nbformat_minor + if actual_nbformat_minor != STANDARD_NBFORMAT_MINOR: + problems.append( + f" Wrong nbformat_minor: got {actual_nbformat_minor}, " + f"expected {STANDARD_NBFORMAT_MINOR}" + ) + + # Phase 3: Non-SOLUTION notebooks must have clean outputs. 
+ if not is_solution_notebook(notebook_path): + output_problems = check_clean_outputs(notebook) + if output_problems: + problems.extend(output_problems) + + if not problems: + print(f"{GREEN}✓{NC} {notebook_path} ({tag})") + return True + + # There are problems + print(f"{RED}✗{NC} {notebook_path} ({tag})") + for problem in problems: + print(f" {problem}") + + if fix: + notebook["metadata"] = expected_metadata + notebook["nbformat"] = STANDARD_NBFORMAT + notebook["nbformat_minor"] = STANDARD_NBFORMAT_MINOR + + if not is_solution_notebook(notebook_path): + strip_outputs(notebook) + + with open(notebook_path, "w", encoding="utf-8") as f: + json.dump(notebook, f, indent=1, ensure_ascii=False) + f.write("\n") + + print(f" {GREEN}→ Fixed{NC}") + + return False + + +def find_notebook_dirs(repo_root: Path) -> list[Path]: + """ + Return all directories that should be checked for notebooks. + + This includes every subdirectory under tutorials/ and the + Accelerated_Python_User_Guide directory (if it exists). + """ + dirs = [] + + tutorials_root = repo_root / "tutorials" + if tutorials_root.is_dir(): + dirs.extend(sorted(d for d in tutorials_root.iterdir() if d.is_dir())) + + user_guide = repo_root / "Accelerated_Python_User_Guide" + if user_guide.is_dir(): + dirs.append(user_guide) + + return dirs + + +def resolve_tutorial_path(tutorial_arg: str, repo_root: Path) -> Path: + """Resolve a tutorial argument to an absolute directory path.""" + if "/" in tutorial_arg or Path(tutorial_arg).is_dir(): + path = Path(tutorial_arg) + if not path.is_absolute(): + path = repo_root / path + return path + return repo_root / "tutorials" / tutorial_arg + + +def check_directory(dir_path: Path, repo_root: Path, fix: bool) -> tuple[int, int]: + """ + Check all notebooks in a directory. + + Returns (passed, failed) counts. 
+ """ + notebooks = sorted(dir_path.rglob("*.ipynb")) + notebooks = [nb for nb in notebooks if ".ipynb_checkpoints" not in str(nb)] + + if not notebooks: + return 0, 0 + + try: + display_path = dir_path.relative_to(repo_root) + except ValueError: + display_path = dir_path + print(f"Checking notebook format in: {display_path}") + print() + + passed = 0 + failed = 0 + + for notebook_path in notebooks: + if check_notebook(notebook_path, fix): + passed += 1 + else: + failed += 1 + + print() + return passed, failed + + +def main(): + parser = argparse.ArgumentParser( + description="Test Jupyter notebook format integrity and metadata." + ) + parser.add_argument( + "tutorial", + nargs="?", + default=None, + help=( + 'Tutorial name (e.g., "accelerated-python") or path to tutorial ' + "directory. If omitted, all tutorials and the " + "Accelerated_Python_User_Guide are checked." + ), + ) + parser.add_argument( + "--fix", + action="store_true", + help="Automatically correct metadata that does not match", + ) + + args = parser.parse_args() + + # Resolve paths + script_dir = Path(__file__).resolve().parent + repo_root = script_dir.parent + + if args.fix: + print(f"{YELLOW}Fix mode enabled: metadata will be corrected in-place{NC}") + print() + + # Determine which directories to check + if args.tutorial is not None: + tutorial_path = resolve_tutorial_path(args.tutorial, repo_root) + if not tutorial_path.is_dir(): + print(f"{RED}Error: Tutorial directory not found: {tutorial_path}{NC}") + sys.exit(1) + dirs_to_check = [tutorial_path] + else: + dirs_to_check = find_notebook_dirs(repo_root) + if not dirs_to_check: + print(f"{YELLOW}No tutorial directories found in {repo_root}{NC}") + sys.exit(0) + + total_passed = 0 + total_failed = 0 + + for dir_path in dirs_to_check: + passed, failed = check_directory(dir_path, repo_root, args.fix) + total_passed += passed + total_failed += failed + + print("=" * 80) + if total_failed == 0: + print( + f"{GREEN}✅ All {total_passed} notebook(s) have 
correct format and " + f"metadata!{NC}" + ) + return 0 + else: + action = "fixed" if args.fix else "failed" + print( + f"{RED}❌ {total_failed} notebook(s) {action}, " + f"{total_passed} passed out of {total_passed + total_failed} total{NC}" + ) + if not args.fix: + print(f"\nRun with --fix to automatically correct metadata.") + return 0 if args.fix else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tutorials/accelerated-python/notebooks/distributed/60__mpi4py.ipynb b/tutorials/accelerated-python/notebooks/distributed/60__mpi4py.ipynb index b3a11000..1d198d51 100644 --- a/tutorials/accelerated-python/notebooks/distributed/60__mpi4py.ipynb +++ b/tutorials/accelerated-python/notebooks/distributed/60__mpi4py.ipynb @@ -563,8 +563,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { - "provenance": [] + "gpuType": "T4", + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -581,7 +584,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/distributed/61__dask.ipynb b/tutorials/accelerated-python/notebooks/distributed/61__dask.ipynb index c72d65cb..0eea3e2d 100644 --- a/tutorials/accelerated-python/notebooks/distributed/61__dask.ipynb +++ b/tutorials/accelerated-python/notebooks/distributed/61__dask.ipynb @@ -401,7 +401,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb index 136e2b07..b3307395 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb +++ 
b/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb @@ -1,354 +1,367 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", - "metadata": { - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" - }, - "source": [ - "# NumPy `ndarray` Basics" - ] - }, - { - "cell_type": "markdown", - "id": "a5ba6a1c", - "metadata": {}, - "source": [ - "## Table of Contents\n", - "\n", - "1. [The De Facto Standard for Array Data](#1.-The-De-Facto-Standard-for-Array-Data)\n", - "2. [Anatomy of an `ndarray`: Structure and Memory](#2.-Anatomy-of-an-`ndarray`:-Structure-and-Memory)\n", - "3. [Array Creation and Logical Views (Views vs. Copies)](#3.-Array-Creation-and-Logical-Views-(Views-vs.-Copies))\n", - "4. [Aggregations and Axes](#4.-Aggregations-and-Axes)\n", - "5. [Broadcasting: The \"Stretch\" Rule](#5.-Broadcasting:-The-\"Stretch\"-Rule)\n", - "6. [Why Vectorize? The Speed Advantage](#6.-Why-Vectorize?-The-Speed-Advantage)" - ] - }, - { - "cell_type": "markdown", - "id": "b30427de", - "metadata": {}, - "source": [ - "## 1. The De Facto Standard for Array Data\n", - "\n", - "NumPy is the foundational library for High Performance Computing (HPC) and Machine Learning (ML) in Python. Libraries like PyTorch, Pandas, and Scikit-learn are built upon or mirror the NumPy API. Learning NumPy is essential for mastering the Array Programming paradigm.\n", - "\n", - "NumPy provides the `ndarray` (N-dimensional array), a powerful, high-performance, and uniform container that enables highly efficient memory management, indexing, slicing, and, most importantly, vectorized arithmetic." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", - "metadata": { - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" - }, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "c59fce80", - "metadata": {}, - "source": [ - "## 2. 
Anatomy of an `ndarray`: Structure and Memory\n", - "\n", - "Unlike a standard Python list, an `ndarray` is a fixed-size, structured block of contiguous memory. Its efficiency comes from these four key, immutable properties:\n", - "\n", - "- **Data**: A pointer to the memory location holding the elements.\n", - "- **dtype**: The data type (e.g., `int32`, `float64`) which is uniform across all elements.\n", - "- **Shape**: A tuple defining the size along each dimension (e.g., $(100, 50)$ for 100 rows and 50 columns).\n", - "- **Strides**: The number of bytes to step in memory to reach the next element along each dimension—this is how NumPy efficiently handles different shapes and views.\n", - "\n", - "Let's explore these properties by creating a large dataset.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- `np.arange(start, stop, step)`: Returns evenly spaced values in the half-open interval $[\\text{start}, \\text{stop})$.\n", - "- `arr.nbytes`: Total bytes consumed by the array's elements (in bytes).\n", - "- `arr.ndim`: The number of array dimensions (integer).\n", - "- `arr.size`: The total number of elements in the array (integer).\n", - "- `arr.shape`: The tuple of array dimensions.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "465e35bd", - "metadata": {}, - "outputs": [], - "source": [ - "# Use a large number to clearly demonstrate the memory density of ndarrays\n", - "N = 50_000_000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3", - "metadata": { - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3" - }, - "outputs": [], - "source": [ - "# TODO: Create the input data array with the numbers 1 to 50_000_000 (inclusive).\n", - "# Hint: np.arange generates values within a half-open interval [start, stop)\n", - "arr = ..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622", - "metadata": { - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622" - }, - "outputs": [], - "source": [ - "# TODO: Calculate how large the array is in GB with nbytes.\n", - "# Hint: GB is 1e9 bytes. The .nbytes attribute returns the total bytes consumed by the elements.\n", - "# Note: This demonstrates that arrays are dense memory blocks, unlike pointer-heavy Python lists.\n", - "arr..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656", - "metadata": { - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656" - }, - "outputs": [], - "source": [ - "# TODO: How many dimensions does the array have? (ndim)\n", - "arr..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28", - "metadata": { - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28" - }, - "outputs": [], - "source": [ - "# TODO: How many elements does the array have? (size)\n", - "arr..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63887722-c9d7-405e-a019-e75646115541", - "metadata": { - "id": "63887722-c9d7-405e-a019-e75646115541" - }, - "outputs": [], - "source": [ - "# TODO: What is the shape of the array?\n", - "arr..." - ] - }, - { - "cell_type": "markdown", - "id": "f5e58ee4", - "metadata": {}, - "source": [ - "## 3. Array Creation and Logical Views (Views vs. Copies)\n", - "\n", - "Arrays can logically represent data in many ways (e.g., 1D signal, 2D image, 4D video batch) independent of the underlying physical memory block.\n", - "\n", - "A critical performance feature is that operations like transposing or `reshape` often return a **View** instead of a **Copy**. 
A View only changes the metadata (`shape` and `strides`) without duplicating the physical data, making these operations nearly instantaneous.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- `np.linspace(start, stop, num)`: Returns `num` evenly spaced samples, calculated over the interval $[\\text{start}, \\text{stop}]$.\n", - "- `np.random.default_rng().random(size)`: Returns random floats in $[0.0, 1.0)$. `size` can be a tuple.\n", - "- `arr.sort()`: Sorts an array in-place (modifies the original data). Use `np.sort(arr)` to return a sorted copy.\n", - "- `arr.reshape(new_shape)`: Returns a View with a new shape. One dimension can be -1, instructing NumPy to calculate the size automatically.\n", - "- `np.resize(arr, new_shape)`: Returns a new array with the specified shape. If the new shape is larger, it fills the new elements by repeating the original array.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9", - "metadata": { - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9" - }, - "outputs": [], - "source": [ - "# TODO: Create a new array with 5_000_000 elements containing equally spaced values between 0 to 1000 (inclusive).\n", - "arr = ...\n", - "arr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050", - "metadata": { - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050" - }, - "outputs": [], - "source": [ - "# TODO: Create a random array that is 10_000 rows by 5_000 columns.\n", - "arr = ...\n", - "arr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd", - "metadata": { - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd" - }, - "outputs": [], - "source": [ - "# TODO: Sort that array (in-place).\n", - "# Note: arr.sort() modifies the array directly, which is typically faster than creating a copy.\n", - "arr..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d", - "metadata": { - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d" - }, - "outputs": [], - "source": [ - "# TODO: Reshape the array to have the last dimension of length 5. \n", - "# Ensure that the operation only changes the logical view without duplicating the physical data pointer.\n", - "# Hint: You can use -1 for one dimension to let NumPy automatically calculate the size based on the total elements.\n", - "arr_new = ...\n", - "arr_new" - ] - }, - { - "cell_type": "markdown", - "id": "54982876", - "metadata": {}, - "source": [ - "## 4. Aggregations and Axes\n", - "\n", - "When performing aggregations (like `sum`, `mean`, `max`), you must specify the **Axis** you want to collapse (or reduce) the array along.\n", - "\n", - "- **Axis 0**: The first dimension (often rows in 2D). Aggregating across Axis 0 produces a result for each column.\n", - "- **Axis 1**: The second dimension (often columns in 2D). Aggregating across Axis 1 produces a result for each row.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- `np.sum(a, axis=None)`: Sum of array elements over a given axis.\n", - " - `axis=0`: Collapse the rows (sum vertical columns).\n", - " - `axis=1`: Collapse the columns (sum horizontal rows).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583", - "metadata": { - "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583" - }, - "outputs": [], - "source": [ - "# TODO: Find the sum of each row in the reshaped array (arr_new) above.\n", - "# Hint: To sum the row's content, we must reduce across the columns.\n", - "arr_sum = ...\n", - "arr_sum" - ] - }, - { - "cell_type": "markdown", - "id": "ed072cee", - "metadata": {}, - "source": [ - "## 5. Broadcasting: The \"Stretch\" Rule\n", - "\n", - "Broadcasting is NumPy's mechanism for performing arithmetic between arrays of different shapes. 
If dimensions don't match, NumPy attempts to \"stretch\" the smaller array to match the larger one.\n", - "\n", - "**The Compatibility Rule:** Two dimensions are compatible when:\n", - "1. They are equal, or\n", - "2. One of them is 1.\n", - "\n", - "If a dimension is 1, NumPy logically copies that single value across the dimension to match the other array's shape **without allocating any new memory**.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- **Arithmetic Operators** (`/`, `*`, `+`, `-`): These operate element-wise. Broadcasting occurs if shapes are different but compatible.\n", - "- `np.allclose(a, b)`: Returns `True` if two floating-point arrays are element-wise equal within a tolerance. Essential for comparisons instead of using `==`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15342af-2916-481a-9724-9874acf4ed24", - "metadata": { - "id": "b15342af-2916-481a-9724-9874acf4ed24" - }, - "outputs": [], - "source": [ - "# TODO: Normalize each row of the 2D array (arr_new) by dividing by the sum you just computed (arr_sum).\n", - "# Hint: 'arr_new' is (M, N) and 'arr_sum' is (M,). To successfully divide, you may need to reshape 'arr_sum' to (M, 1)\n", - "# so that broadcasting can stretch it across the N columns.\n", - "arr_normalized = ...\n", - "arr_normalized" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf", - "metadata": { - "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf" - }, - "outputs": [], - "source": [ - "# EXTRA CREDIT: Prove that your normalized array is actually normalized.\n", - "# Hint: If normalized correctly, the sum of every row should now be 1.0.\n", - "# Check if the new row sums are close to 1.0 using np.allclose." - ] - }, - { - "cell_type": "markdown", - "id": "31657dd2", - "metadata": {}, - "source": [ - "## 6. Why Vectorize? 
The Speed Advantage\n", - "\n", - "The entire Array Programming paradigm hinges on **Vectorization**.\n", - "\n", - "Why use complex shapes and broadcasting instead of simple Python `for` loops?\n", - "\n", - "NumPy's array functions are implemented in highly optimized native code (C/C++, Fortran). An operation like `A + A**2`, where `A` is a massive `ndarray`, is often $\\mathbf{100\\times}$ faster than performing the equivalent element-wise operation using explicit Python loops.\n", - "\n", - "**Always choose a vectorized NumPy function or operator over a manual Python loop.**" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } + "cells": [ + { + "cell_type": "markdown", + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", + "metadata": { + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" + }, + "source": [ + "# NumPy `ndarray` Basics" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "cell_type": "markdown", + "id": "a5ba6a1c", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "1. [The De Facto Standard for Array Data](#1.-The-De-Facto-Standard-for-Array-Data)\n", + "2. [Anatomy of an `ndarray`: Structure and Memory](#2.-Anatomy-of-an-`ndarray`:-Structure-and-Memory)\n", + "3. [Array Creation and Logical Views (Views vs. Copies)](#3.-Array-Creation-and-Logical-Views-(Views-vs.-Copies))\n", + "4. [Aggregations and Axes](#4.-Aggregations-and-Axes)\n", + "5. [Broadcasting: The \"Stretch\" Rule](#5.-Broadcasting:-The-\"Stretch\"-Rule)\n", + "6. [Why Vectorize? The Speed Advantage](#6.-Why-Vectorize?-The-Speed-Advantage)" + ] + }, + { + "cell_type": "markdown", + "id": "b30427de", + "metadata": {}, + "source": [ + "## 1. The De Facto Standard for Array Data\n", + "\n", + "NumPy is the foundational library for High Performance Computing (HPC) and Machine Learning (ML) in Python. 
Libraries like PyTorch, Pandas, and Scikit-learn are built upon or mirror the NumPy API. Learning NumPy is essential for mastering the Array Programming paradigm.\n", + "\n", + "NumPy provides the `ndarray` (N-dimensional array), a powerful, high-performance, and uniform container that enables highly efficient memory management, indexing, slicing, and, most importantly, vectorized arithmetic." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", + "metadata": { + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "c59fce80", + "metadata": {}, + "source": [ + "## 2. Anatomy of an `ndarray`: Structure and Memory\n", + "\n", + "Unlike a standard Python list, an `ndarray` is a fixed-size, structured block of contiguous memory. Its efficiency comes from these four key, immutable properties:\n", + "\n", + "- **Data**: A pointer to the memory location holding the elements.\n", + "- **dtype**: The data type (e.g., `int32`, `float64`) which is uniform across all elements.\n", + "- **Shape**: A tuple defining the size along each dimension (e.g., $(100, 50)$ for 100 rows and 50 columns).\n", + "- **Strides**: The number of bytes to step in memory to reach the next element along each dimension—this is how NumPy efficiently handles different shapes and views.\n", + "\n", + "Let's explore these properties by creating a large dataset.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.arange(start, stop, step)`: Returns evenly spaced values in the half-open interval $[\\text{start}, \\text{stop})$.\n", + "- `arr.nbytes`: Total bytes consumed by the array's elements (in bytes).\n", + "- `arr.ndim`: The number of array dimensions (integer).\n", + "- `arr.size`: The total number of elements in the array (integer).\n", + "- `arr.shape`: The tuple of array dimensions.\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "465e35bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Use a large number to clearly demonstrate the memory density of ndarrays\n", + "N = 50_000_000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3", + "metadata": { + "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3" + }, + "outputs": [], + "source": [ + "# TODO: Create the input data array with the numbers 1 to 50_000_000 (inclusive).\n", + "# Hint: np.arange generates values within a half-open interval [start, stop)\n", + "arr = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50530f2c-29bf-4061-8f84-bc5be00a5622", + "metadata": { + "id": "50530f2c-29bf-4061-8f84-bc5be00a5622" + }, + "outputs": [], + "source": [ + "# TODO: Calculate how large the array is in GB with nbytes.\n", + "# Hint: GB is 1e9 bytes. The .nbytes attribute returns the total bytes consumed by the elements.\n", + "# Note: This demonstrates that arrays are dense memory blocks, unlike pointer-heavy Python lists.\n", + "arr..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656", + "metadata": { + "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656" + }, + "outputs": [], + "source": [ + "# TODO: How many dimensions does the array have? (ndim)\n", + "arr..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28", + "metadata": { + "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28" + }, + "outputs": [], + "source": [ + "# TODO: How many elements does the array have? (size)\n", + "arr..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63887722-c9d7-405e-a019-e75646115541", + "metadata": { + "id": "63887722-c9d7-405e-a019-e75646115541" + }, + "outputs": [], + "source": [ + "# TODO: What is the shape of the array?\n", + "arr..." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f5e58ee4", + "metadata": {}, + "source": [ + "## 3. Array Creation and Logical Views (Views vs. Copies)\n", + "\n", + "Arrays can logically represent data in many ways (e.g., 1D signal, 2D image, 4D video batch) independent of the underlying physical memory block.\n", + "\n", + "A critical performance feature is that operations like transposing or `reshape` often return a **View** instead of a **Copy**. A View only changes the metadata (`shape` and `strides`) without duplicating the physical data, making these operations nearly instantaneous.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.linspace(start, stop, num)`: Returns `num` evenly spaced samples, calculated over the interval $[\\text{start}, \\text{stop}]$.\n", + "- `np.random.default_rng().random(size)`: Returns random floats in $[0.0, 1.0)$. `size` can be a tuple.\n", + "- `arr.sort()`: Sorts an array in-place (modifies the original data). Use `np.sort(arr)` to return a sorted copy.\n", + "- `arr.reshape(new_shape)`: Returns a View with a new shape. One dimension can be -1, instructing NumPy to calculate the size automatically.\n", + "- `np.resize(arr, new_shape)`: Returns a new array with the specified shape. 
If the new shape is larger, it fills the new elements by repeating the original array.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9", + "metadata": { + "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9" + }, + "outputs": [], + "source": [ + "# TODO: Create a new array with 5_000_000 elements containing equally spaced values between 0 and 1000 (inclusive).\n", + "arr = ...\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050", + "metadata": { + "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050" + }, + "outputs": [], + "source": [ + "# TODO: Create a random array that is 10_000 rows by 5_000 columns.\n", + "arr = ...\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd", + "metadata": { + "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd" + }, + "outputs": [], + "source": [ + "# TODO: Sort that array (in-place).\n", + "# Note: arr.sort() modifies the array directly, which is typically faster than creating a copy.\n", + "arr..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdde560b-5ba6-484c-a601-00b7ef71273d", + "metadata": { + "id": "cdde560b-5ba6-484c-a601-00b7ef71273d" + }, + "outputs": [], + "source": [ + "# TODO: Reshape the array to have the last dimension of length 5. \n", + "# Ensure that the operation only changes the logical view without duplicating the physical data pointer.\n", + "# Hint: You can use -1 for one dimension to let NumPy automatically calculate the size based on the total elements.\n", + "arr_new = ...\n", + "arr_new" + ] + }, + { + "cell_type": "markdown", + "id": "54982876", + "metadata": {}, + "source": [ + "## 4. 
Aggregations and Axes\n", + "\n", + "When performing aggregations (like `sum`, `mean`, `max`), you must specify the **Axis** you want to collapse (or reduce) the array along.\n", + "\n", + "- **Axis 0**: The first dimension (often rows in 2D). Aggregating across Axis 0 produces a result for each column.\n", + "- **Axis 1**: The second dimension (often columns in 2D). Aggregating across Axis 1 produces a result for each row.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.sum(a, axis=None)`: Sum of array elements over a given axis.\n", + " - `axis=0`: Collapse the rows (sum vertical columns).\n", + " - `axis=1`: Collapse the columns (sum horizontal rows).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583", + "metadata": { + "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583" + }, + "outputs": [], + "source": [ + "# TODO: Find the sum of each row in the reshaped array (arr_new) above.\n", + "# Hint: To sum the row's content, we must reduce across the columns.\n", + "arr_sum = ...\n", + "arr_sum" + ] + }, + { + "cell_type": "markdown", + "id": "ed072cee", + "metadata": {}, + "source": [ + "## 5. Broadcasting: The \"Stretch\" Rule\n", + "\n", + "Broadcasting is NumPy's mechanism for performing arithmetic between arrays of different shapes. If dimensions don't match, NumPy attempts to \"stretch\" the smaller array to match the larger one.\n", + "\n", + "**The Compatibility Rule:** Two dimensions are compatible when:\n", + "1. They are equal, or\n", + "2. One of them is 1.\n", + "\n", + "If a dimension is 1, NumPy logically copies that single value across the dimension to match the other array's shape **without allocating any new memory**.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- **Arithmetic Operators** (`/`, `*`, `+`, `-`): These operate element-wise. 
Broadcasting occurs if shapes are different but compatible.\n", + "- `np.allclose(a, b)`: Returns `True` if two floating-point arrays are element-wise equal within a tolerance. Essential for comparisons instead of using `==`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15342af-2916-481a-9724-9874acf4ed24", + "metadata": { + "id": "b15342af-2916-481a-9724-9874acf4ed24" + }, + "outputs": [], + "source": [ + "# TODO: Normalize each row of the 2D array (arr_new) by dividing by the sum you just computed (arr_sum).\n", + "# Hint: 'arr_new' is (M, N) and 'arr_sum' is (M,). To successfully divide, you may need to reshape 'arr_sum' to (M, 1)\n", + "# so that broadcasting can stretch it across the N columns.\n", + "arr_normalized = ...\n", + "arr_normalized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf", + "metadata": { + "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf" + }, + "outputs": [], + "source": [ + "# EXTRA CREDIT: Prove that your normalized array is actually normalized.\n", + "# Hint: If normalized correctly, the sum of every row should now be 1.0.\n", + "# Check if the new row sums are close to 1.0 using np.allclose." + ] + }, + { + "cell_type": "markdown", + "id": "31657dd2", + "metadata": {}, + "source": [ + "## 6. Why Vectorize? The Speed Advantage\n", + "\n", + "The entire Array Programming paradigm hinges on **Vectorization**.\n", + "\n", + "Why use complex shapes and broadcasting instead of simple Python `for` loops?\n", + "\n", + "NumPy's array functions are implemented in highly optimized native code (C/C++, Fortran). 
An operation like `A + A**2`, where `A` is a massive `ndarray`, is often $\\mathbf{100\\times}$ faster than performing the equivalent element-wise operation using explicit Python loops.\n", + "\n", + "**Always choose a vectorized NumPy function or operator over a manual Python loop.**" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/02__numpy_linear_algebra__svd_reconstruction.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/02__numpy_linear_algebra__svd_reconstruction.ipynb index b6f7f1fd..65d3f06e 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/02__numpy_linear_algebra__svd_reconstruction.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/02__numpy_linear_algebra__svd_reconstruction.ipynb @@ -262,6 +262,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -277,14 +283,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" - }, - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "accelerator": "GPU" + "version": "3.11.7" + } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb 
b/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb index d6843ff5..a0cc2859 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb @@ -1,501 +1,514 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f966f67f", - "metadata": {}, - "source": [ - "# Accelerated Computing with CuPy\n", - "\n", - "## Table of Contents\n", - "1. [Creating Arrays: CPU vs. GPU](#1.-Creating-Arrays:-CPU-vs.-GPU)\n", - "2. [Basic Operations](#2.-Basic-Operations)\n", - " - [Sequential Operations & Memory](#Sequential-Operations-&-Memory)\n", - "3. [Complex Operations (Linear Algebra)](#3.-Complex-Operations-(Linear-Algebra))\n", - " - [Agnostic Code (NumPy Dispatch)](#Agnostic-Code-(NumPy-Dispatch))\n", - "4. [Device Management](#4.-Device-Management)\n", - "5. [Exercise - NumPy to CuPy](#Exercise---NumPy-to-CuPy)\n", - " - [Part 1](#Part-1)\n", - " - [Part 2](#Part-2)\n", - "\n", - "---\n", - "\n", - "Let's shift gears to high-level array functionality using **[CuPy](https://cupy.dev/)**.\n", - "\n", - "### What is CuPy?\n", - "CuPy is a library that implements the familiar **NumPy API** but runs on the GPU (using CUDA C++ in the backend). \n", - "\n", - "**Why use it?**\n", - "* **Zero Friction:** If you know NumPy, you already know CuPy.\n", - "* **Speed:** It provides out-of-the-box GPU acceleration for array operations.\n", - "* **Ease of use:** You can often port CPU code to GPU simply by changing `import numpy as np` to `import cupy as cp`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d369bcdc", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "from cupyx.profiler import benchmark\n", - "\n", - "# Helper to display benchmark results concisely.\n", - "# We use CuPy's benchmark() throughout this notebook for accurate GPU timing.\n", - "def print_benchmark(result, device=\"gpu\"):\n", - " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", - " if device == \"gpu\":\n", - " avg_ms = result.gpu_times.mean() * 1000\n", - " std_ms = result.gpu_times.std() * 1000\n", - " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", - " else:\n", - " avg_ms = result.cpu_times.mean() * 1000\n", - " std_ms = result.cpu_times.std() * 1000\n", - " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")" - ] - }, - { - "cell_type": "markdown", - "id": "15fc304c", - "metadata": {}, - "source": [ - "## 1. Creating Arrays: CPU vs. GPU\n", - "\n", - "Let's compare the performance of creating a large 3D array (approx. 
2GB in size) on the CPU versus the GPU.\n", - "\n", - "We will use `np.ones` for the CPU and `cp.ones` for the GPU.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0f8b002", - "metadata": {}, - "outputs": [], - "source": [ - "# CPU creation\n", - "print_benchmark(benchmark(np.ones, ((1000, 500, 500),), n_repeat=10), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19309ca7", - "metadata": {}, - "outputs": [], - "source": [ - "# GPU creation\n", - "print_benchmark(benchmark(cp.ones, ((1000, 500, 500),), n_repeat=10), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "id": "ae637eaf", - "metadata": {}, - "source": [ - "We can see here that creating this array on the GPU is much faster than doing so on the CPU!\n", - "\n", - "**About `cupyx.profiler.benchmark`:**\n", - "\n", - "We use CuPy's built-in `benchmark` utility for timing GPU operations. This is important because GPU operations are **asynchronous** - when you call a CuPy function, the CPU places a task in the GPU's \"to-do list\" (stream) and immediately moves on without waiting.\n", - "\n", - "The `benchmark` function handles all the complexity of proper GPU timing for us:\n", - "- It automatically synchronizes GPU streams to get accurate measurements.\n", - "- It runs warm-up iterations to avoid cold-start overhead.\n", - "- It reports both CPU and GPU times separately.\n", - "\n", - "This makes it the recommended way to time CuPy code, as it's both accurate and convenient." - ] - }, - { - "cell_type": "markdown", - "id": "6d179e9b", - "metadata": {}, - "source": [ - "## 2. Basic Operations\n", - "\n", - "The syntax for mathematical operations is identical. Let's multiply every value in our arrays by `5`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de5bdefb", - "metadata": {}, - "outputs": [], - "source": [ - "# Create fresh arrays for the benchmark\n", - "x_cpu = np.ones((1000, 500, 500))\n", - "x_gpu = cp.ones((1000, 500, 500))\n", - "\n", - "def multiply(x):\n", - " return x * 5\n", - "\n", - "# CPU Operation\n", - "print_benchmark(benchmark(multiply, (x_cpu,), n_repeat=10), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a7f32b8", - "metadata": {}, - "outputs": [], - "source": [ - "# GPU Operation\n", - "print_benchmark(benchmark(multiply, (x_gpu,), n_repeat=10), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "id": "bc24579f", - "metadata": {}, - "source": [ - "The GPU completes this operation notably faster, with the code staying the same." - ] - }, - { - "cell_type": "markdown", - "id": "83c69334", - "metadata": {}, - "source": [ - "### Sequential Operations & Memory\n", - "\n", - "Now let's do a couple of operations sequentially, something which would suffer from memory transfer times in Numba examples without explicit memory management." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0294dbc", - "metadata": {}, - "outputs": [], - "source": [ - "def sequential_math(x):\n", - " x = x * 5\n", - " x = x * x\n", - " x = x + x\n", - " return x\n", - "\n", - "# CPU: Sequential math\n", - "print_benchmark(benchmark(sequential_math, (x_cpu,), n_repeat=10), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acafdbe7", - "metadata": {}, - "outputs": [], - "source": [ - "# GPU: Sequential math\n", - "print_benchmark(benchmark(sequential_math, (x_gpu,), n_repeat=10), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "id": "0f250bbb", - "metadata": {}, - "source": [ - "The GPU ran that much faster even without us explicitly managing memory. This is because CuPy is handling all of this for us transparently." 
- ] - }, - { - "cell_type": "markdown", - "id": "84221268", - "metadata": {}, - "source": [ - "## 3. Complex Operations (Linear Algebra)\n", - "\n", - "GPUs excel at Linear Algebra. Let's look at **Singular Value Decomposition (SVD)**, a computationally heavy $O(N^3)$ operation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "978af795", - "metadata": {}, - "outputs": [], - "source": [ - "# CPU SVD\n", - "x_cpu = np.random.random((1000, 1000))\n", - "print_benchmark(benchmark(np.linalg.svd, (x_cpu,), n_repeat=5), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0bc855b", - "metadata": {}, - "outputs": [], - "source": [ - "# GPU SVD\n", - "x_gpu = cp.random.random((1000, 1000))\n", - "print_benchmark(benchmark(cp.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "id": "e298f0ea", - "metadata": {}, - "source": [ - "The GPU outperforms the CPU again with exactly the same API!" - ] - }, - { - "cell_type": "markdown", - "id": "4a0870d0", - "metadata": {}, - "source": [ - "### Agnostic Code (NumPy Dispatch)\n", - "\n", - "A key feature of CuPy is that many **NumPy functions work on CuPy arrays without changing your code**.\n", - "\n", - "When you pass a CuPy GPU array (`x_gpu`) into a NumPy function that supports the `__array_function__` protocol (e.g., `np.linalg.svd`), NumPy detects the CuPy input and **delegates the operation to CuPy’s own implementation**, which runs on the GPU.\n", - "\n", - "This allows you to write code using standard `np.*` syntax and have it run on either CPU or GPU seamlessly - **as long as CuPy implements an override for that function.**\n", - "\n", - "CuPy also protects you from hidden performance penalties: **it forbids implicit GPU → CPU copies**, raising a `TypeError` when NumPy tries to convert a `cupy.ndarray` into a `numpy.ndarray` behind the scenes. 
This ensures all device-to-host transfers are **explicit and intentional**, never silent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba4f2863", - "metadata": {}, - "outputs": [], - "source": [ - "# We create the data on the GPU\n", - "x_gpu = cp.random.random((1000, 1000))\n", - "\n", - "# BUT we call the standard NumPy function - CuPy dispatches it to the GPU!\n", - "print_benchmark(benchmark(np.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "id": "6e37faae", - "metadata": {}, - "source": [ - "## 4. Device Management\n", - "\n", - "If you have multiple GPUs, CuPy uses the concept of a \"Current Device\" context. \n", - "\n", - "You can use a `with` statement to ensure specific arrays are created on specific cards (e.g., GPU 0 vs GPU 1).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26aa4f57", - "metadata": {}, - "outputs": [], - "source": [ - "with cp.cuda.Device(0):\n", - " x_on_gpu0 = cp.random.random((100000, 1000))\n", - "\n", - "print(f\"Array is on device: {x_on_gpu0.device}\")" - ] - }, - { - "cell_type": "markdown", - "id": "32f7226a", - "metadata": {}, - "source": [ - "**Note:** CuPy functions generally expect all input arrays to be on the **same** device. Passing an array stored on a non-current device may work depending on the hardware configuration but is generally discouraged as it may not be performant.\n" - ] - }, - { - "cell_type": "markdown", - "id": "2e0a4a03", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", - "metadata": { - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" - }, - "source": [ - "## Exercise - NumPy to CuPy\n", - "\n", - "### Part 1\n", - "Let's put the \"Drop-in Replacement\" philosophy to the test with the same data pipeline as the previous notebook. 
Specifically, the single block of code below performs the following steps:\n", - "1) Generate a massive dataset (50 million elements).\n", - "2) Process it using a heavy operation (Sorting).\n", - "3) Manipulate the shape and normalize the data (Broadcasting).\n", - "4) Verify the integrity of the result.\n", - "\n", - "**TODO:**\n", - "1. Run the cell below with `xp = np` (CPU Mode). Note the benchmark output.\n", - "2. Change the setup line to `xp = cp` (GPU Mode). Run it again.\n", - "3. Observe how the exact same logic runs significantly faster on the GPU with CuPy while retaining the implementation properties of NumPy.\n", - "\n", - "Note: We use `cupyx.profiler.benchmark` for timing, which automatically handles GPU synchronization." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", - "metadata": { - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "from cupyx.profiler import benchmark\n", - "\n", - "# Re-defined here so this exercise cell is self-contained and can run independently.\n", - "def print_benchmark(result, device=\"gpu\"):\n", - " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", - " if device == \"gpu\":\n", - " avg_ms = result.gpu_times.mean() * 1000\n", - " std_ms = result.gpu_times.std() * 1000\n", - " else:\n", - " avg_ms = result.cpu_times.mean() * 1000\n", - " std_ms = result.cpu_times.std() * 1000\n", - " print(f\" -> {result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", - "\n", - "# --- 1. SETUP: CHOOSE YOUR DEVICE ---\n", - "xp = np # Toggle this to 'cp' for GPU acceleration\n", - "\n", - "print(f\"Running on: {xp.__name__.upper()}\")\n", - "\n", - "# --- 2. DATA GENERATION ---\n", - "N = 50_000_000\n", - "print(f\"Generating {N:,} random elements ({N*8/1e9:.2f} GB)...\")\n", - "arr = xp.random.rand(N)\n", - "\n", - "# --- 3. 
HEAVY COMPUTATION (TIMED) ---\n", - "print(\"Sorting data...\")\n", - "# benchmark() handles GPU synchronization automatically\n", - "result = benchmark(xp.sort, (arr,), n_repeat=5)\n", - "print_benchmark(result, device=\"gpu\" if xp == cp else \"cpu\")\n", - "\n", - "# --- 4. MANIPULATION & BROADCASTING ---\n", - "# Purpose: Demonstrate that CuPy supports complex reshaping and broadcasting rules exactly like NumPy.\n", - "# This shows you don't need to rewrite your data processing logic.\n", - "\n", - "# Reshape to a matrix with 5 columns\n", - "arr_new = arr.reshape((-1, 5))\n", - "\n", - "# Normalize: Divide every row by its sum using broadcasting\n", - "row_sums = arr_new.sum(axis=1)\n", - "normalized_matrix = arr_new / row_sums[:, xp.newaxis]\n", - "\n", - "# --- 5. VERIFICATION ---\n", - "# Purpose: Verify mathematical correctness/integrity of the result.\n", - "check_sums = xp.sum(normalized_matrix, axis=1)\n", - "xp.testing.assert_allclose(check_sums, 1.0)\n", - "\n", - "print(\" -> Verification: PASSED (All rows sum to 1.0)\")" - ] - }, - { - "cell_type": "markdown", - "id": "077b7589", - "metadata": {}, - "source": [ - "**TODO: When working with CuPy arrays, try changing `xp.testing.assert_allclose` to `np.testing.assert_allclose`. What happens and why?**" - ] - }, - { - "cell_type": "markdown", - "id": "AxU_hG5M-LKS", - "metadata": { - "id": "AxU_hG5M-LKS" - }, - "source": [ - "### Part 2\n", - "We will now create a massive dataset (50 million points) representing a sine wave and see how fast the GPU can sort it compared to the CPU. \n", - "\n", - "**TODO:** \n", - "1) **Generate Data:** Create a NumPy array (`y_cpu`) and a CuPy array (`y_gpu`) representing $\\sin(x)$ from $0$ to $2\\pi$ with `50,000,000` points.\n", - "2) **Benchmark CPU and GPU:** Use `benchmark()` from `cupyx.profiler` to measure both `np.sort` and `cp.sort`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "EKwfS_iM9Yps", - "metadata": { - "id": "EKwfS_iM9Yps" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "from cupyx.profiler import benchmark\n", - "\n", - "# --- Step 1: Generate Data ---\n", - "N = 50_000_000\n", - "print(f\"Generating {N} points...\")\n", - "\n", - "# TODO: Create x_cpu using np.linspace from 0 to 2*pi\n", - "# TODO: Create y_cpu by taking np.sin(x_cpu)\n", - "\n", - "# TODO: Create x_gpu using cp.linspace from 0 to 2*pi\n", - "# TODO: Create y_gpu by taking cp.sin(x_gpu)\n", - "\n", - "\n", - "# --- Step 2: Benchmark NumPy (CPU) ---\n", - "print(\"Benchmarking NumPy Sort (this may take a few seconds)...\")\n", - "# TODO: Use benchmark(function, (args,), n_repeat=5)\n", - "# Hint: Pass the function `np.sort` and the argument `(y_cpu,)`\n", - "# Note: The comma in (y_cpu,) is required to make it a tuple!\n", - "\n", - "\n", - "# --- Step 3: Benchmark CuPy (GPU) ---\n", - "print(\"Benchmarking CuPy Sort...\")\n", - "# TODO: Use benchmark(function, (args,), n_repeat=5)\n", - "# Hint: Pass the function `cp.sort` and the argument `(y_gpu,)`\n", - "# Note: The comma in (y_gpu,) is required to make it a tuple!" - ] - }, - { - "cell_type": "markdown", - "id": "qnAvEk5QFAA8", - "metadata": { - "id": "qnAvEk5QFAA8" - }, - "source": [ - "**EXTRA CREDIT: Benchmark with different array sizes and find the size at which CuPy and NumPy take the same amount of time. Try to extract the timing data from `cupyx.profiler.benchmark`'s return value and customize how the output is displayed. 
You could even make a graph.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42YwwyrJFTyV", - "metadata": { - "id": "42YwwyrJFTyV" - }, - "outputs": [], - "source": [ - "sizes = [5, 50, 500, 5_000, 50_000, 500_000, 5_000_000, 50_000_000]\n", - "\n", - "# TODO" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "f966f67f", + "metadata": {}, + "source": [ + "# Accelerated Computing with CuPy\n", + "\n", + "## Table of Contents\n", + "1. [Creating Arrays: CPU vs. GPU](#1.-Creating-Arrays:-CPU-vs.-GPU)\n", + "2. [Basic Operations](#2.-Basic-Operations)\n", + " - [Sequential Operations & Memory](#Sequential-Operations-&-Memory)\n", + "3. [Complex Operations (Linear Algebra)](#3.-Complex-Operations-(Linear-Algebra))\n", + " - [Agnostic Code (NumPy Dispatch)](#Agnostic-Code-(NumPy-Dispatch))\n", + "4. [Device Management](#4.-Device-Management)\n", + "5. [Exercise - NumPy to CuPy](#Exercise---NumPy-to-CuPy)\n", + " - [Part 1](#Part-1)\n", + " - [Part 2](#Part-2)\n", + "\n", + "---\n", + "\n", + "Let's shift gears to high-level array functionality using **[CuPy](https://cupy.dev/)**.\n", + "\n", + "### What is CuPy?\n", + "CuPy is a library that implements the familiar **NumPy API** but runs on the GPU (using CUDA C++ in the backend). \n", + "\n", + "**Why use it?**\n", + "* **Zero Friction:** If you know NumPy, you already know CuPy.\n", + "* **Speed:** It provides out-of-the-box GPU acceleration for array operations.\n", + "* **Ease of use:** You can often port CPU code to GPU simply by changing `import numpy as np` to `import cupy as cp`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d369bcdc", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "from cupyx.profiler import benchmark\n", + "\n", + "# Helper to display benchmark results concisely.\n", + "# We use CuPy's benchmark() throughout this notebook for accurate GPU timing.\n", + "def print_benchmark(result, device=\"gpu\"):\n", + " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", + " if device == \"gpu\":\n", + " avg_ms = result.gpu_times.mean() * 1000\n", + " std_ms = result.gpu_times.std() * 1000\n", + " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", + " else:\n", + " avg_ms = result.cpu_times.mean() * 1000\n", + " std_ms = result.cpu_times.std() * 1000\n", + " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "15fc304c", + "metadata": {}, + "source": [ + "## 1. Creating Arrays: CPU vs. GPU\n", + "\n", + "Let's compare the performance of creating a large 3D array (approx. 
2GB in size) on the CPU versus the GPU.\n", + "\n", + "We will use `np.ones` for the CPU and `cp.ones` for the GPU.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f8b002", + "metadata": {}, + "outputs": [], + "source": [ + "# CPU creation\n", + "print_benchmark(benchmark(np.ones, ((1000, 500, 500),), n_repeat=10), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19309ca7", + "metadata": {}, + "outputs": [], + "source": [ + "# GPU creation\n", + "print_benchmark(benchmark(cp.ones, ((1000, 500, 500),), n_repeat=10), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "id": "ae637eaf", + "metadata": {}, + "source": [ + "We can see here that creating this array on the GPU is much faster than doing so on the CPU!\n", + "\n", + "**About `cupyx.profiler.benchmark`:**\n", + "\n", + "We use CuPy's built-in `benchmark` utility for timing GPU operations. This is important because GPU operations are **asynchronous** - when you call a CuPy function, the CPU places a task in the GPU's \"to-do list\" (stream) and immediately moves on without waiting.\n", + "\n", + "The `benchmark` function handles all the complexity of proper GPU timing for us:\n", + "- It automatically synchronizes GPU streams to get accurate measurements.\n", + "- It runs warm-up iterations to avoid cold-start overhead.\n", + "- It reports both CPU and GPU times separately.\n", + "\n", + "This makes it the recommended way to time CuPy code, as it's both accurate and convenient." + ] + }, + { + "cell_type": "markdown", + "id": "6d179e9b", + "metadata": {}, + "source": [ + "## 2. Basic Operations\n", + "\n", + "The syntax for mathematical operations is identical. Let's multiply every value in our arrays by `5`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de5bdefb", + "metadata": {}, + "outputs": [], + "source": [ + "# Create fresh arrays for the benchmark\n", + "x_cpu = np.ones((1000, 500, 500))\n", + "x_gpu = cp.ones((1000, 500, 500))\n", + "\n", + "def multiply(x):\n", + " return x * 5\n", + "\n", + "# CPU Operation\n", + "print_benchmark(benchmark(multiply, (x_cpu,), n_repeat=10), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a7f32b8", + "metadata": {}, + "outputs": [], + "source": [ + "# GPU Operation\n", + "print_benchmark(benchmark(multiply, (x_gpu,), n_repeat=10), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "id": "bc24579f", + "metadata": {}, + "source": [ + "The GPU completes this operation notably faster, with the code staying the same." + ] + }, + { + "cell_type": "markdown", + "id": "83c69334", + "metadata": {}, + "source": [ + "### Sequential Operations & Memory\n", + "\n", + "Now let's do a couple of operations sequentially, something which would suffer from memory transfer times in Numba examples without explicit memory management." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0294dbc", + "metadata": {}, + "outputs": [], + "source": [ + "def sequential_math(x):\n", + " x = x * 5\n", + " x = x * x\n", + " x = x + x\n", + " return x\n", + "\n", + "# CPU: Sequential math\n", + "print_benchmark(benchmark(sequential_math, (x_cpu,), n_repeat=10), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acafdbe7", + "metadata": {}, + "outputs": [], + "source": [ + "# GPU: Sequential math\n", + "print_benchmark(benchmark(sequential_math, (x_gpu,), n_repeat=10), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "id": "0f250bbb", + "metadata": {}, + "source": [ + "The GPU ran that much faster even without us explicitly managing memory. This is because CuPy is handling all of this for us transparently." 
+ ] + }, + { + "cell_type": "markdown", + "id": "84221268", + "metadata": {}, + "source": [ + "## 3. Complex Operations (Linear Algebra)\n", + "\n", + "GPUs excel at Linear Algebra. Let's look at **Singular Value Decomposition (SVD)**, a computationally heavy $O(N^3)$ operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "978af795", + "metadata": {}, + "outputs": [], + "source": [ + "# CPU SVD\n", + "x_cpu = np.random.random((1000, 1000))\n", + "print_benchmark(benchmark(np.linalg.svd, (x_cpu,), n_repeat=5), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0bc855b", + "metadata": {}, + "outputs": [], + "source": [ + "# GPU SVD\n", + "x_gpu = cp.random.random((1000, 1000))\n", + "print_benchmark(benchmark(cp.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "id": "e298f0ea", + "metadata": {}, + "source": [ + "The GPU outperforms the CPU again with exactly the same API!" + ] + }, + { + "cell_type": "markdown", + "id": "4a0870d0", + "metadata": {}, + "source": [ + "### Agnostic Code (NumPy Dispatch)\n", + "\n", + "A key feature of CuPy is that many **NumPy functions work on CuPy arrays without changing your code**.\n", + "\n", + "When you pass a CuPy GPU array (`x_gpu`) into a NumPy function that supports the `__array_function__` protocol (e.g., `np.linalg.svd`), NumPy detects the CuPy input and **delegates the operation to CuPy’s own implementation**, which runs on the GPU.\n", + "\n", + "This allows you to write code using standard `np.*` syntax and have it run on either CPU or GPU seamlessly - **as long as CuPy implements an override for that function.**\n", + "\n", + "CuPy also protects you from hidden performance penalties: **it forbids implicit GPU → CPU copies**, raising a `TypeError` when NumPy tries to convert a `cupy.ndarray` into a `numpy.ndarray` behind the scenes. 
This ensures all device-to-host transfers are **explicit and intentional**, never silent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba4f2863", + "metadata": {}, + "outputs": [], + "source": [ + "# We create the data on the GPU\n", + "x_gpu = cp.random.random((1000, 1000))\n", + "\n", + "# BUT we call the standard NumPy function - CuPy dispatches it to the GPU!\n", + "print_benchmark(benchmark(np.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "id": "6e37faae", + "metadata": {}, + "source": [ + "## 4. Device Management\n", + "\n", + "If you have multiple GPUs, CuPy uses the concept of a \"Current Device\" context. \n", + "\n", + "You can use a `with` statement to ensure specific arrays are created on specific cards (e.g., GPU 0 vs GPU 1).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26aa4f57", + "metadata": {}, + "outputs": [], + "source": [ + "with cp.cuda.Device(0):\n", + " x_on_gpu0 = cp.random.random((100000, 1000))\n", + "\n", + "print(f\"Array is on device: {x_on_gpu0.device}\")" + ] + }, + { + "cell_type": "markdown", + "id": "32f7226a", + "metadata": {}, + "source": [ + "**Note:** CuPy functions generally expect all input arrays to be on the **same** device. Passing an array stored on a non-current device may work depending on the hardware configuration but is generally discouraged as it may not be performant.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2e0a4a03", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", + "metadata": { + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" + }, + "source": [ + "## Exercise - NumPy to CuPy\n", + "\n", + "### Part 1\n", + "Let's put the \"Drop-in Replacement\" philosophy to the test with the same data pipeline as the previous notebook. 
Specifically, the single block of code below performs the following steps:\n", + "1) Generate a massive dataset (50 million elements).\n", + "2) Process it using a heavy operation (Sorting).\n", + "3) Manipulate the shape and normalize the data (Broadcasting).\n", + "4) Verify the integrity of the result.\n", + "\n", + "**TODO:**\n", + "1. Run the cell below with `xp = np` (CPU Mode). Note the benchmark output.\n", + "2. Change the setup line to `xp = cp` (GPU Mode). Run it again.\n", + "3. Observe how the exact same logic runs significantly faster on the GPU with CuPy while retaining the implementation properties of NumPy.\n", + "\n", + "Note: We use `cupyx.profiler.benchmark` for timing, which automatically handles GPU synchronization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", + "metadata": { + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "from cupyx.profiler import benchmark\n", + "\n", + "# Re-defined here so this exercise cell is self-contained and can run independently.\n", + "def print_benchmark(result, device=\"gpu\"):\n", + " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", + " if device == \"gpu\":\n", + " avg_ms = result.gpu_times.mean() * 1000\n", + " std_ms = result.gpu_times.std() * 1000\n", + " else:\n", + " avg_ms = result.cpu_times.mean() * 1000\n", + " std_ms = result.cpu_times.std() * 1000\n", + " print(f\" -> {result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", + "\n", + "# --- 1. SETUP: CHOOSE YOUR DEVICE ---\n", + "xp = np # Toggle this to 'cp' for GPU acceleration\n", + "\n", + "print(f\"Running on: {xp.__name__.upper()}\")\n", + "\n", + "# --- 2. DATA GENERATION ---\n", + "N = 50_000_000\n", + "print(f\"Generating {N:,} random elements ({N*8/1e9:.2f} GB)...\")\n", + "arr = xp.random.rand(N)\n", + "\n", + "# --- 3. 
HEAVY COMPUTATION (TIMED) ---\n", + "print(\"Sorting data...\")\n", + "# benchmark() handles GPU synchronization automatically\n", + "result = benchmark(xp.sort, (arr,), n_repeat=5)\n", + "print_benchmark(result, device=\"gpu\" if xp == cp else \"cpu\")\n", + "\n", + "# --- 4. MANIPULATION & BROADCASTING ---\n", + "# Purpose: Demonstrate that CuPy supports complex reshaping and broadcasting rules exactly like NumPy.\n", + "# This shows you don't need to rewrite your data processing logic.\n", + "\n", + "# Reshape to a matrix with 5 columns\n", + "arr_new = arr.reshape((-1, 5))\n", + "\n", + "# Normalize: Divide every row by its sum using broadcasting\n", + "row_sums = arr_new.sum(axis=1)\n", + "normalized_matrix = arr_new / row_sums[:, xp.newaxis]\n", + "\n", + "# --- 5. VERIFICATION ---\n", + "# Purpose: Verify mathematical correctness/integrity of the result.\n", + "check_sums = xp.sum(normalized_matrix, axis=1)\n", + "xp.testing.assert_allclose(check_sums, 1.0)\n", + "\n", + "print(\" -> Verification: PASSED (All rows sum to 1.0)\")" + ] + }, + { + "cell_type": "markdown", + "id": "077b7589", + "metadata": {}, + "source": [ + "**TODO: When working with CuPy arrays, try changing `xp.testing.assert_allclose` to `np.testing.assert_allclose`. What happens and why?**" + ] + }, + { + "cell_type": "markdown", + "id": "AxU_hG5M-LKS", + "metadata": { + "id": "AxU_hG5M-LKS" + }, + "source": [ + "### Part 2\n", + "We will now create a massive dataset (50 million points) representing a sine wave and see how fast the GPU can sort it compared to the CPU. \n", + "\n", + "**TODO:** \n", + "1) **Generate Data:** Create a NumPy array (`y_cpu`) and a CuPy array (`y_gpu`) representing $\\sin(x)$ from $0$ to $2\\pi$ with `50,000,000` points.\n", + "2) **Benchmark CPU and GPU:** Use `benchmark()` from `cupyx.profiler` to measure both `np.sort` and `cp.sort`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "EKwfS_iM9Yps", + "metadata": { + "id": "EKwfS_iM9Yps" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "from cupyx.profiler import benchmark\n", + "\n", + "# --- Step 1: Generate Data ---\n", + "N = 50_000_000\n", + "print(f\"Generating {N} points...\")\n", + "\n", + "# TODO: Create x_cpu using np.linspace from 0 to 2*pi\n", + "# TODO: Create y_cpu by taking np.sin(x_cpu)\n", + "\n", + "# TODO: Create x_gpu using cp.linspace from 0 to 2*pi\n", + "# TODO: Create y_gpu by taking cp.sin(x_gpu)\n", + "\n", + "\n", + "# --- Step 2: Benchmark NumPy (CPU) ---\n", + "print(\"Benchmarking NumPy Sort (this may take a few seconds)...\")\n", + "# TODO: Use benchmark(function, (args,), n_repeat=5)\n", + "# Hint: Pass the function `np.sort` and the argument `(y_cpu,)`\n", + "# Note: The comma in (y_cpu,) is required to make it a tuple!\n", + "\n", + "\n", + "# --- Step 3: Benchmark CuPy (GPU) ---\n", + "print(\"Benchmarking CuPy Sort...\")\n", + "# TODO: Use benchmark(function, (args,), n_repeat=5)\n", + "# Hint: Pass the function `cp.sort` and the argument `(y_gpu,)`\n", + "# Note: The comma in (y_gpu,) is required to make it a tuple!" + ] + }, + { + "cell_type": "markdown", + "id": "qnAvEk5QFAA8", + "metadata": { + "id": "qnAvEk5QFAA8" + }, + "source": [ + "**EXTRA CREDIT: Benchmark with different array sizes and find the size at which CuPy and NumPy take the same amount of time. Try to extract the timing data from `cupyx.profiler.benchmark`'s return value and customize how the output is displayed. 
You could even make a graph.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42YwwyrJFTyV", + "metadata": { + "id": "42YwwyrJFTyV" + }, + "outputs": [], + "source": [ + "sizes = [5, 50, 500, 5_000, 50_000, 500_000, 5_000_000, 50_000_000]\n", + "\n", + "# TODO" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/04__numpy_to_cupy__svd_reconstruction.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/04__numpy_to_cupy__svd_reconstruction.ipynb index 1a6b612e..f585d9a0 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/04__numpy_to_cupy__svd_reconstruction.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/04__numpy_to_cupy__svd_reconstruction.ipynb @@ -1,435 +1,436 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "hdnif2_TxAjl" - }, - "source": [ - "## Exercise - NumPy to CuPy - SVD Reconstruction\n", - "\n", - "Let's try another NumPy to CuPy porting exercise, this time with the SVD reconstruction code from before.\n", - "\n", - "**TODO: Port this code to CuPy. Here's what you'll have to do:**\n", - "\n", - "- **Change `import numpy as xp` to `import cupy as xp`.**\n", - "- **NumPy arrays are converted to CuPy arrays using `xp.asarray()`. 
You'll see errors like `only supports cupy.ndarray` if you forget to do this.**\n", - "- **CuPy arrays are converted back to NumPy arrays (for Matplotlib) using `xp.asnumpy()`.**\n", - "\n", - "First, we need to import the compute vision and plotting stack we're using:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mLnsPrdx5hP0" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import cv2\n", - "import numpy as xp" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lVI52P80xAj5" - }, - "source": [ - "Next let's download an image of Bryce's dog:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LaH1Q3DWFpa4" - }, - "outputs": [], - "source": [ - "import urllib.request\n", - "urllib.request.urlretrieve(\n", - " \"https://drive.usercontent.google.com/download?id=1ClKrHt4-SIHaeBJdF0K3MG64jyVnt62L&export=download\",\n", - " \"loonie.jpg\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jFgvt9fHxAkG" - }, - "source": [ - "Then we read the image in grayscale mode:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rcaTYmdpiElb" - }, - "outputs": [], - "source": [ - "image = cv2.imread(\"loonie.jpg\", cv2.IMREAD_GRAYSCALE)\n", - "\n", - "print(f\"nbytes: {image.nbytes}\")\n", - "print(f\"shape: {image.shape}\")\n", - "print(image)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PAvIt5iJxAkK" - }, - "source": [ - "Here we can see the image is 1600x1200 pixels, and each pixel is an unsigned 8-bit value (0-255). 
Let's plot it with matplotlib to verify it looks correct:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H8DnT8Ro7L8l" - }, - "outputs": [], - "source": [ - "plt.imshow(image, cmap=\"gray\")\n", - "plt.title(\"Bryce's Dog\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k-C5yXn7xAkS" - }, - "source": [ - "Yes, we can confirm that is a dog (and a very cute one at that). Now let's start doing some linear algebra!\n", - "\n", - "NumPy provides an [implementation of SVD](https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html). By selecting `full_matrices=False`, we get the singular value matrix, `S`, as a 1D vector rather than a 2D diagonal matrix." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "U12FLtFU7I0f" - }, - "outputs": [], - "source": [ - "U, S, Vt = xp.linalg.svd(image, full_matrices=False)\n", - "U.shape, S.shape, Vt.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jdurZSEmxAkW" - }, - "source": [ - "Since the image is not square and we've not selected `full_matrices`, NumPy returns `U` as a non-square matrix, `S` as the 1D vector which is the smaller of the two dimensions, and the `Vt` matrix is a square matrix.\n", - "\n", - "The singular values are returned in descending order, which we can see if we look at the first 10 elements of `S`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MJEM_K7jxAkX" - }, - "outputs": [], - "source": [ - "S[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "peIZQRlAxAkY" - }, - "source": [ - "In fact, if we look at the size of the singular values, we see that the first few contribute a lot to the matrix, and then fall off very rapidly:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wHMliqddxAkY" - }, - "outputs": [], - "source": [ - "plt.semilogy(S)" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "id": "m1g2UwP1xAkZ" - }, - "source": [ - "That suggests we can get a pretty good approximation of the original image with a relatively small number of terms. We can reconstruct the image matrix by slicing the `U`, `S`, and `Vt` matrices and remultiplying them. We will need to convert `S` back into a 2D matrix for the multiplication as well. Note that we are using the `@` operator to perform matrix multiplication, because `*` does element-wise multiplication." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ABIkKrCpxAkZ" - }, - "outputs": [], - "source": [ - "# First 3 terms.\n", - "nterms = 3\n", - "reconstructed = U[:, :nterms] @ xp.diag(S[:nterms]) @ Vt[:nterms, :]\n", - "plt.imshow(reconstructed, cmap=\"gray\")\n", - "plt.title(\"n = 3\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qAZDQDIRxAka" - }, - "source": [ - "That's still pretty fuzzy, so let's check out the image with more terms included:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "v4ihZ_3_7X_P" - }, - "outputs": [], - "source": [ - "plt.figure(figsize=(16, 4))\n", - "\n", - "start, end, step = 10, 50, 10\n", - "for i in range(start, end, step):\n", - " plt.subplot(1, (end - start) // step + 1, (i - start) // step + 1)\n", - " reconstructed = U[:, :i] @ xp.diag(S[:i]) @ Vt[:i, :]\n", - " plt.imshow(reconstructed, cmap=\"gray\")\n", - " plt.title(f\"n = {i}\")\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "weLC3cKaxAkc" - }, - "source": [ - "**EXTRA CREDIT: After you port this for loop to CuPy, consider the flow of compute and I/O. Are there any problems with this pattern? How could it be improved?**\n", - "\n", - "Now we'll print the compression ratio for the values of `n` used above. 
This is the number of bytes of the reduced arrays added together and divided by the size of the original grayscale image array. It seems we can get significant storage savings with this technique." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Pu2lao_KxAkd" - }, - "outputs": [], - "source": [ - "for i in range(start, end, step):\n", - " compress_ratio = (U[:, :i].nbytes + S[:i].nbytes + Vt[:i, :].nbytes) / image.nbytes\n", - " print(f\"n = {i}: compression = {compress_ratio:.1%}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W1EiUWXKxAkf" - }, - "source": [ - "Next, we compute and display the difference in the reconstruction for `n = 10` and the original image using `cmap=\"coolwarm\"` to display the difference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dPUpGb37xAkg" - }, - "outputs": [], - "source": [ - "delta = image - (U[:,:10] @ xp.diag(S[:10]) @ Vt[:10,:])\n", - "plt.imshow(delta, cmap=\"coolwarm\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5ypywm1uxAkh" - }, - "source": [ - "Now that you have gotten SVD to work on CuPy, let's benchmark it! To make things clearer, let's reimport NumPy and CuPy with their usual abbreviations:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0AjBKJuExAki" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "import cupyx as cpx # For `cupyx.profiler.benchmark`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QJGruJIzxAki" - }, - "source": [ - "We have to be very careful when benchmarking GPU code. 
As GPU programming is inherently asynchronous, so it can be tricky to make sure we're measuring the right thing.\n", - "\n", - "Imagine you're measuring how long it takes to ship a package to someone, but you only time how long it takes for you to drop it off at the post office, not how long it takes for them to receive it and send you a thank you.\n", - "\n", - "Common Pythonic benchmarking tools like `%timeit` are not GPU aware, so it's easy to measure incorrectly with them. We can only use them when we know the code we're benchmarking will perform the proper synchronization. It's better to use something like [`cupyx.profiler.benchmark`](https://docs.cupy.dev/en/stable/reference/generated/cupyx.profiler.benchmark.html#cupyx.profiler.benchmark).\n", - "\n", - "First, we need a NumPy (CPU) and CuPy (GPU) copy of our image:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fr2zfGYIxAkj" - }, - "outputs": [], - "source": [ - "cpu_image = cv2.imread('loonie.jpg', cv2.IMREAD_GRAYSCALE)\n", - "gpu_image = cp.asarray(cpu_image)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WNb_Jv88xAkl" - }, - "source": [ - "Next let's benchmark both CPU and GPU execution:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WwnvkVsnxAkm" - }, - "outputs": [], - "source": [ - "repeat = 10\n", - "warmup = 1\n", - "D_np = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", - " np.linalg.svd(cpu_image, full_matrices=False)\n", - ").cpu_times\n", - "D_cp = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", - " cp.linalg.svd(gpu_image, full_matrices=False)\n", - ").cpu_times\n", - "\n", - "print(f\"SVD (Host) = {D_np.mean():.3g} s ± {(D_np.std() / D_np.mean()):.2%} (mean ± relative stdev of {D_np.size} runs)\")\n", - "print(f\"SVD (Device) = {D_cp.mean():.3g} s ± {(D_cp.std() / D_cp.mean()):.2%} (mean ± relative stdev of {D_cp.size} runs)\")" - ] - }, - { 
- "cell_type": "markdown", - "metadata": { - "id": "TE6qPht1xAkm" - }, - "source": [ - "Depending on your hardware, the CPU and GPU might be close to the same speed, or the GPU might even be slower! This is because the image is not big enough to fully utilize the GPU. We can simulate a larger image by tiling the image using `np.tile`. This duplicates the image both along axis 0 and axis 1:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "P-Hl0D5BxAkn" - }, - "outputs": [], - "source": [ - "cpu_image_tile = np.tile(cpu_image, (2, 2))\n", - "gpu_image_tile = cp.asarray(cpu_image_tile)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0pUnjfzYxAkt" - }, - "source": [ - "Now we can benchmark again (this will take longer because the matrices are much bigger):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DgilXuzbxAku" - }, - "outputs": [], - "source": [ - "repeat = 5\n", - "warmup = 1\n", - "D_np = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", - " np.linalg.svd(cpu_image_tile, full_matrices=False)\n", - ").cpu_times\n", - "D_cp = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", - " cp.linalg.svd(gpu_image_tile, full_matrices=False)\n", - ").cpu_times\n", - "\n", - "print(f\"SVD (Host) = {D_np.mean():.3g} s ± {(D_np.std() / D_np.mean()):.2%} (mean ± relative stdev of {D_np.size} runs)\")\n", - "print(f\"SVD (Device) = {D_cp.mean():.3g} s ± {(D_cp.std() / D_cp.mean()):.2%} (mean ± relative stdev of {D_cp.size} runs)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5nlgOqkBxAkw" - }, - "source": [ - "**TODO: Experiment with differ sizes of image by changing the `np.tile` arguments. 
When is the GPU faster?**" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "hdnif2_TxAjl" + }, + "source": [ + "## Exercise - NumPy to CuPy - SVD Reconstruction\n", + "\n", + "Let's try another NumPy to CuPy porting exercise, this time with the SVD reconstruction code from before.\n", + "\n", + "**TODO: Port this code to CuPy. Here's what you'll have to do:**\n", + "\n", + "- **Change `import numpy as xp` to `import cupy as xp`.**\n", + "- **NumPy arrays are converted to CuPy arrays using `xp.asarray()`. 
You'll see errors like `only supports cupy.ndarray` if you forget to do this.**\n", + "- **CuPy arrays are converted back to NumPy arrays (for Matplotlib) using `xp.asnumpy()`.**\n", + "\n", + "First, we need to import the computer vision and plotting stack we're using:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mLnsPrdx5hP0" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import cv2\n", + "import numpy as xp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lVI52P80xAj5" + }, + "source": [ + "Next let's download an image of Bryce's dog:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LaH1Q3DWFpa4" + }, + "outputs": [], + "source": [ + "import urllib.request\n", + "urllib.request.urlretrieve(\n", + " \"https://drive.usercontent.google.com/download?id=1ClKrHt4-SIHaeBJdF0K3MG64jyVnt62L&export=download\",\n", + " \"loonie.jpg\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jFgvt9fHxAkG" + }, + "source": [ + "Then we read the image in grayscale mode:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rcaTYmdpiElb" + }, + "outputs": [], + "source": [ + "image = cv2.imread(\"loonie.jpg\", cv2.IMREAD_GRAYSCALE)\n", + "\n", + "print(f\"nbytes: {image.nbytes}\")\n", + "print(f\"shape: {image.shape}\")\n", + "print(image)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PAvIt5iJxAkK" + }, + "source": [ + "Here we can see the image is 1600x1200 pixels, and each pixel is an unsigned 8-bit value (0-255). 
Let's plot it with matplotlib to verify it looks correct:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "H8DnT8Ro7L8l" + }, + "outputs": [], + "source": [ + "plt.imshow(image, cmap=\"gray\")\n", + "plt.title(\"Bryce's Dog\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k-C5yXn7xAkS" + }, + "source": [ + "Yes, we can confirm that is a dog (and a very cute one at that). Now let's start doing some linear algebra!\n", + "\n", + "NumPy provides an [implementation of SVD](https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html). By selecting `full_matrices=False`, we get the singular value matrix, `S`, as a 1D vector rather than a 2D diagonal matrix." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U12FLtFU7I0f" + }, + "outputs": [], + "source": [ + "U, S, Vt = xp.linalg.svd(image, full_matrices=False)\n", + "U.shape, S.shape, Vt.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jdurZSEmxAkW" + }, + "source": [ + "Since the image is not square and we've not selected `full_matrices`, NumPy returns `U` as a non-square matrix, `S` as the 1D vector which is the smaller of the two dimensions, and the `Vt` matrix is a square matrix.\n", + "\n", + "The singular values are returned in descending order, which we can see if we look at the first 10 elements of `S`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MJEM_K7jxAkX" + }, + "outputs": [], + "source": [ + "S[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "peIZQRlAxAkY" + }, + "source": [ + "In fact, if we look at the size of the singular values, we see that the first few contribute a lot to the matrix, and then fall off very rapidly:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wHMliqddxAkY" + }, + "outputs": [], + "source": [ + "plt.semilogy(S)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "m1g2UwP1xAkZ" + }, + "source": [ + "That suggests we can get a pretty good approximation of the original image with a relatively small number of terms. We can reconstruct the image matrix by slicing the `U`, `S`, and `Vt` matrices and remultiplying them. We will need to convert `S` back into a 2D matrix for the multiplication as well. Note that we are using the `@` operator to perform matrix multiplication, because `*` does element-wise multiplication." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ABIkKrCpxAkZ" + }, + "outputs": [], + "source": [ + "# First 3 terms.\n", + "nterms = 3\n", + "reconstructed = U[:, :nterms] @ xp.diag(S[:nterms]) @ Vt[:nterms, :]\n", + "plt.imshow(reconstructed, cmap=\"gray\")\n", + "plt.title(\"n = 3\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qAZDQDIRxAka" + }, + "source": [ + "That's still pretty fuzzy, so let's check out the image with more terms included:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "v4ihZ_3_7X_P" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(16, 4))\n", + "\n", + "start, end, step = 10, 50, 10\n", + "for i in range(start, end, step):\n", + " plt.subplot(1, (end - start) // step + 1, (i - start) // step + 1)\n", + " reconstructed = U[:, :i] @ xp.diag(S[:i]) @ Vt[:i, :]\n", + " plt.imshow(reconstructed, cmap=\"gray\")\n", + " plt.title(f\"n = {i}\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "weLC3cKaxAkc" + }, + "source": [ + "**EXTRA CREDIT: After you port this for loop to CuPy, consider the flow of compute and I/O. Are there any problems with this pattern? How could it be improved?**\n", + "\n", + "Now we'll print the compression ratio for the values of `n` used above. 
This is the number of bytes of the reduced arrays added together and divided by the size of the original grayscale image array. It seems we can get significant storage savings with this technique." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Pu2lao_KxAkd" + }, + "outputs": [], + "source": [ + "for i in range(start, end, step):\n", + " compress_ratio = (U[:, :i].nbytes + S[:i].nbytes + Vt[:i, :].nbytes) / image.nbytes\n", + " print(f\"n = {i}: compression = {compress_ratio:.1%}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W1EiUWXKxAkf" + }, + "source": [ + "Next, we compute and display the difference in the reconstruction for `n = 10` and the original image using `cmap=\"coolwarm\"` to display the difference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dPUpGb37xAkg" + }, + "outputs": [], + "source": [ + "delta = image - (U[:,:10] @ xp.diag(S[:10]) @ Vt[:10,:])\n", + "plt.imshow(delta, cmap=\"coolwarm\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5ypywm1uxAkh" + }, + "source": [ + "Now that you have gotten SVD to work on CuPy, let's benchmark it! To make things clearer, let's reimport NumPy and CuPy with their usual abbreviations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0AjBKJuExAki" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "import cupyx as cpx # For `cupyx.profiler.benchmark`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QJGruJIzxAki" + }, + "source": [ + "We have to be very careful when benchmarking GPU code. 
GPU programming is inherently asynchronous, so it can be tricky to make sure we're measuring the right thing.\n", + "\n", + "Imagine you're measuring how long it takes to ship a package to someone, but you only time how long it takes for you to drop it off at the post office, not how long it takes for them to receive it and send you a thank you.\n", + "\n", + "Common Pythonic benchmarking tools like `%timeit` are not GPU aware, so it's easy to measure incorrectly with them. We can only use them when we know the code we're benchmarking will perform the proper synchronization. It's better to use something like [`cupyx.profiler.benchmark`](https://docs.cupy.dev/en/stable/reference/generated/cupyx.profiler.benchmark.html#cupyx.profiler.benchmark).\n", + "\n", + "First, we need a NumPy (CPU) and CuPy (GPU) copy of our image:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fr2zfGYIxAkj" + }, + "outputs": [], + "source": [ + "cpu_image = cv2.imread('loonie.jpg', cv2.IMREAD_GRAYSCALE)\n", + "gpu_image = cp.asarray(cpu_image)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WNb_Jv88xAkl" + }, + "source": [ + "Next let's benchmark both CPU and GPU execution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WwnvkVsnxAkm" + }, + "outputs": [], + "source": [ + "repeat = 10\n", + "warmup = 1\n", + "D_np = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", + " np.linalg.svd(cpu_image, full_matrices=False)\n", + ").cpu_times\n", + "D_cp = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", + " cp.linalg.svd(gpu_image, full_matrices=False)\n", + ").cpu_times\n", + "\n", + "print(f\"SVD (Host) = {D_np.mean():.3g} s ± {(D_np.std() / D_np.mean()):.2%} (mean ± relative stdev of {D_np.size} runs)\")\n", + "print(f\"SVD (Device) = {D_cp.mean():.3g} s ± {(D_cp.std() / D_cp.mean()):.2%} (mean ± relative stdev of {D_cp.size} runs)\")" + ] + }, + { 
+ "cell_type": "markdown", + "metadata": { + "id": "TE6qPht1xAkm" + }, + "source": [ + "Depending on your hardware, the CPU and GPU might be close to the same speed, or the GPU might even be slower! This is because the image is not big enough to fully utilize the GPU. We can simulate a larger image by tiling the image using `np.tile`. This duplicates the image both along axis 0 and axis 1:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "P-Hl0D5BxAkn" + }, + "outputs": [], + "source": [ + "cpu_image_tile = np.tile(cpu_image, (2, 2))\n", + "gpu_image_tile = cp.asarray(cpu_image_tile)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0pUnjfzYxAkt" + }, + "source": [ + "Now we can benchmark again (this will take longer because the matrices are much bigger):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DgilXuzbxAku" + }, + "outputs": [], + "source": [ + "repeat = 5\n", + "warmup = 1\n", + "D_np = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", + " np.linalg.svd(cpu_image_tile, full_matrices=False)\n", + ").cpu_times\n", + "D_cp = cpx.profiler.benchmark(n_repeat=repeat, n_warmup=warmup, func=lambda:\n", + " cp.linalg.svd(gpu_image_tile, full_matrices=False)\n", + ").cpu_times\n", + "\n", + "print(f\"SVD (Host) = {D_np.mean():.3g} s ± {(D_np.std() / D_np.mean()):.2%} (mean ± relative stdev of {D_np.size} runs)\")\n", + "print(f\"SVD (Device) = {D_cp.mean():.3g} s ± {(D_cp.std() / D_cp.mean()):.2%} (mean ± relative stdev of {D_cp.size} runs)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5nlgOqkBxAkw" + }, + "source": [ + "**TODO: Experiment with differ sizes of image by changing the `np.tile` arguments. 
When is the GPU faster?**" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb index 3b367a5b..98569d28 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb @@ -529,12 +529,30 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb index 8745d64a..2d6434b5 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb @@ -1,426 +1,439 
@@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "8KP1pYqmtXdr" - }, - "source": [ - "# Asynchrony and Power Iteration\n", - "\n", - "## Table of Contents\n", - "1. [Introduction and Setup](#1-Introduction-and-Setup)\n", - " - [1.1 Environment Setup](#11-Environment-Setup)\n", - "2. [Theory: Streams and Synchronization](#2-Theory:-Streams-and-Synchronization)\n", - "3. [The Baseline Implementation](#3-The-Baseline-Implementation)\n", - "4. [Profiling the Baseline](#4-Profiling-the-Baseline)\n", - "5. [Better Visibility with NVTX](#5-Better-Visibility-with-NVTX)\n", - "6. [Implementing Asynchrony](#6-Implementing-Asynchrony)\n", - "7. [Performance Analysis](#7-Performance-Analysis)\n", - "\n", - "## 1. Introduction and Setup\n", - "\n", - "GPU programming is inherently asynchronous. In this exercise, we will explore the implications of this behavior when using CuPy and learn how to analyze the flow of execution using profiling tools.\n", - "\n", - "We will revisit the Power Iteration algorithm. Our goal is to take a standard implementation, profile it to identify bottlenecks caused by implicit synchronization, and then optimize it using CUDA streams and asynchronous memory transfers.\n", - "\n", - "### 1.1 Environment Setup\n", - "\n", - "First, we need to ensure the Nsight Systems profiler (nsys), Nsightful, and NVTX are installed and available." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rO4kOPuP_0JG" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Install necessary tools if running in Google Colab\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", - " !curl -s -L -O https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb\n", - " !sudo dpkg -i NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb > /dev/null\n", - " !pip install \"nvtx\" \"nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git\" > /dev/null 2>&1\n", - "\n", - "print(\"Environment setup complete.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6sUvjAtMxI3h" - }, - "source": [ - "## 2. Theory: Streams and Synchronization\n", - "\n", - "All GPU work is launched asynchronously on a stream. The work items in a stream are executed in order. If you launch `f` on a stream and later launch `g` on that same stream, then `f` will be executed before `g`. But if `f` and `g` are launched on different streams, then their execution might overlap.\n", - "\n", - "**How CuPy handles this:**\n", - "\n", - "- **Default Stream:** Unless specified, CuPy launches work on the default CUDA stream.\n", - "\n", - "- **Sequential Device Execution:** By default, CuPy work executes sequentially on the GPU.\n", - "\n", - "- **Asynchronous Host Execution:** From the Python (Host) perspective, the code often returns immediately after launching the GPU kernel, before the work is actually finished.\n", - "\n", - "**TODO:** Even though CuPy is asynchronous, certain operations force the CPU to wait for the GPU to finish. What operations do you think implicitly synchronize the host and device?\n", - "\n", - "## 3. 
The Baseline Implementation\n", - "\n", - "We will start with a baseline implementation of the Power Iteration algorithm.\n", - "\n", - "**Note:** The cell below writes the code to a file named `power_iteration__baseline.py`. We do this because we must run the code through the Nsight Systems profiler via the command line." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sEicxhLO_9G9" - }, - "outputs": [], - "source": [ - "%%writefile power_iteration__baseline.py\n", - "\n", - "import numpy as np\n", - "import cupy as cp\n", - "import cupyx as cpx\n", - "import nvtx\n", - "from dataclasses import dataclass\n", - "\n", - "@dataclass\n", - "class PowerIterationConfig:\n", - " dim: int = 8192\n", - " dominance: float = 0.05\n", - " max_steps: int = 1000\n", - " check_frequency: int = 10\n", - " progress: bool = True\n", - " residual_threshold: float = 1e-10\n", - "\n", - "def generate_device(cfg=PowerIterationConfig()):\n", - " cp.random.seed(42)\n", - " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", - " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", - " P = cp.random.random((cfg.dim, cfg.dim))\n", - " D = cp.diag(cp.random.permutation(lam))\n", - " A = ((P @ D) @ cp.linalg.inv(P))\n", - " return A\n", - "\n", - "def estimate_device(A, cfg=PowerIterationConfig()):\n", - " # If `A` is on the host, copy from host to device. 
Otherwise, does nothing.\n", - " A_gpu = cp.asarray(A)\n", - "\n", - " x = cp.ones(A_gpu.shape[0], dtype=np.float64)\n", - "\n", - " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", - " y = A_gpu @ x\n", - " lam = (x @ y) / (x @ x) # Rayleigh quotient.\n", - " res = cp.linalg.norm(y - lam * x)\n", - " x = y / cp.linalg.norm(y) # Normalize for next step.\n", - "\n", - " if cfg.progress:\n", - " print(f\"step {i}: residual = {res:.3e}\")\n", - "\n", - " # Copy from device to host and save a checkpoint.\n", - " np.savetxt(f\"device_{i}.txt\", cp.asnumpy(x))\n", - "\n", - " if res < cfg.residual_threshold:\n", - " break\n", - "\n", - " for _ in range(cfg.check_frequency - 1):\n", - " y = A_gpu @ x # We have to use `A_gpu` here as well.\n", - " x = y / cp.linalg.norm(y) # Normalize for next step.\n", - "\n", - " # Copy from device to host.\n", - " return cp.asnumpy((x.T @ (A_gpu @ x)) / (x.T @ x))\n", - "\n", - "A_device = generate_device()\n", - "\n", - "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", - "estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", - "\n", - "start = cp.cuda.get_current_stream().record()\n", - "lam_est_device = estimate_device(A_device).item()\n", - "stop = cp.cuda.get_current_stream().record()\n", - "\n", - "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", - "\n", - "print()\n", - "print(f\"GPU Execution Time: {duration:.3f} s\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1lyhHnzrdXzI" - }, - "source": [ - "## 4. Profiling the Baseline\n", - "\n", - "Now let's profile our code by running it under the Nsight Systems `nsys` tool. The syntax for this is `nsys `. It will run your program while collecting a birdseye view of everything going on in your program." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1HU5p1IhAkTA" - }, - "outputs": [], - "source": [ - "!nsys profile --cuda-event-trace=false --force-overwrite true -o power_iteration__baseline python power_iteration__baseline.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IlGIAIEPe3SV" - }, - "source": [ - "Now let's view our report and explore what's going on in our program.\n", - "\n", - "**TODO:** Run the next cell, which will generate the report and create a button that when clicked will open it up in Perfetto, a web-based no-install visual profiler.\n", - "\n", - "**EXTRA CREDIT:** Download the Nsight Systems GUI and open the report in it to see even more information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "s6VVOnGQR3Ph" - }, - "outputs": [], - "source": [ - "import nsightful\n", - "\n", - "!nsys export --type sqlite --quiet true --force-overwrite true power_iteration__baseline.nsys-rep\n", - "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__baseline.sqlite\", title=\"Power Iteration - Baseline\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bGxz6-spplcU" - }, - "source": [ - "## 5. Better Visibility with NVTX\n", - "\n", - "Nsight Systems shows us a lot of information - sometimes it's too much and not all relevant.\n", - "\n", - "There's two ways that we can filter and annotate what we see in Nsight systems.\n", - "\n", - "The first is to limit when we start and stop profiling in the program. In Python, we can do this with `cupyx.profiler.profile()`, which give us a Python context manager. 
Any CUDA code used during scope will be included in the profile.\n", - "\n", - "```\n", - "not_in_the profile()\n", - "with cpx.profiler.profile():\n", - " in_the_profile()\n", - "not_in_the_profile()\n", - "```\n", - "\n", - "For this to work, we have to pass `--capture-range=cudaProfilerApi --capture-range-end=stop` as flags to `nsys`.\n", - "\n", - "We can also annotate specific regions of our code, which will show up in the profiler. We can even add categories, domains, and colors to these regions, and they can be nested. To add these annotations, we use `nvtx.annnotate()`, another Python context manager, this time from a library called NVTX.\n", - "\n", - "```\n", - "with nvtx.annotate(\"Loop\")\n", - " for i in range(20):\n", - " with nvtx.annotate(f\"Step {i}\"):\n", - " pass\n", - "```\n", - "\n", - "**TODO:** Go back to the earlier cells and improve the profile results by adding:\n", - "\n", - "- `nvtx.annotate()` regions. Remember, you can nest them.\n", - "\n", - "- A `cpx.profiler.profile()` around the `start =`/`stop =` lines that run the solver.\n", - "\n", - "- `--capture-range=cudaProfilerApi --capture-range-end=stop` to the `nsys` flags.\n", - "\n", - "Then, capture another profile and see if you can identify how we can improve the code. Specifically, think about how we could add more asynchrony." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PF7PUALVfX3A" - }, - "source": [ - "## 6. Implementing Asynchrony\n", - "\n", - "Remember what we've learned about streams and how to use them with CuPy:\n", - "\n", - "- By default, all CuPy operations within a single thread run on the same stream. You can access this stream with `cp.cuda.get_current_stream()`.\n", - "\n", - "- You can create a new stream with `cp.cuda.Stream(non_blocking=True)`. 
Use `with` statements to use the stream for all CuPy operations within a block.\n", - "\n", - "- You can record an event on a stream by calling `.record()` on it.\n", - "\n", - "- You can synchronize on an event (or an entire stream) by calling `.synchronize()` on it.\n", - "\n", - "- Memory transfers will block by default. You can launch them asynchronously with `cp.asarray(..., blocking=False)` (for host to device transfers) and `cp.asnumpy(..., blocking=False)` (for device to host transfers).\n", - "\n", - "**TODO:** Copy the kernel from the earlier cell with your NVTX and CuPy profiler regions into the cell below. Then, try to improve performance by adding asynchrony. Make sure that you don't copy and paste the `%%writefile` directive." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile power_iteration__async.py\n", - "\n", - "import numpy as np\n", - "import cupy as cp\n", - "import cupyx as cpx\n", - "import nvtx\n", - "from dataclasses import dataclass\n", - "\n", - "@dataclass\n", - "class PowerIterationConfig:\n", - " dim: int = 8192\n", - " dominance: float = 0.05\n", - " max_steps: int = 1000\n", - " check_frequency: int = 10\n", - " progress: bool = True\n", - " residual_threshold: float = 1e-10\n", - "\n", - "def generate_device(cfg=PowerIterationConfig()):\n", - " cp.random.seed(42)\n", - " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", - " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", - " P = cp.random.random((cfg.dim, cfg.dim))\n", - " D = cp.diag(cp.random.permutation(lam))\n", - " A = ((P @ D) @ cp.linalg.inv(P))\n", - " return A\n", - "\n", - "def estimate_device(A, cfg=PowerIterationConfig()):\n", - " raise NotImplementedError(\"TODO: You need to implement this kernel!\")\n", - "\n", - "A_device = generate_device()\n", - "\n", - "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", - 
"estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", - "\n", - "start = cp.cuda.get_current_stream().record()\n", - "lam_est_device = estimate_device(A_device).item()\n", - "stop = cp.cuda.get_current_stream().record()\n", - "\n", - "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", - "\n", - "print()\n", - "print(f\"GPU Execution Time: {duration:.3f} s\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's make sure it works:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pszz-k8cDfqy" - }, - "outputs": [], - "source": [ - "!python power_iteration__async.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VFCYIqwaKYqy" - }, - "source": [ - "## 7. Performance Analysis\n", - "\n", - "Before we profile the improved code, let's compare the execution times of both." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSPFNIb9KcPb" - }, - "outputs": [], - "source": [ - "power_iteration_baseline_output = !python power_iteration__baseline.py\n", - "power_iteration_baseline_duration = float(power_iteration_baseline_output[-1].split()[-2])\n", - "power_iteration_async_output = !python power_iteration__async.py\n", - "power_iteration_async_duration = float(power_iteration_async_output[-1].split()[-2])\n", - "speedup = power_iteration_baseline_duration / power_iteration_async_duration\n", - "\n", - "print(f\"GPU Execution Time\")\n", - "print()\n", - "print(f\"power_iteration_baseline: {power_iteration_baseline_duration:.3f} s\")\n", - "print(f\"power_iteration_async: {power_iteration_async_duration:.3f} s\")\n", - "print(f\"power_iteration_async speedup over power_iteration_baseline: {speedup:.2f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "o4WJVFBkkRaN" - }, - "source": [ - "Next, let's capture a profile report of our improved code." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BtQR4CHikWFK" - }, - "outputs": [], - "source": [ - "!nsys profile --cuda-event-trace=false --capture-range=cudaProfilerApi --capture-range-end=stop --force-overwrite true -o power_iteration__async python power_iteration__async.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Pnvne_F4jYTh" - }, - "source": [ - "Finally, let's look at the profile in Perfetto and confirm we've gotten rid of the idling." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mWXBvi-hFGhU" - }, - "outputs": [], - "source": [ - "!nsys export --type sqlite --quiet true --force-overwrite true power_iteration__async.nsys-rep\n", - "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__async.sqlite\", title=\"Power Iteration - Async Event\")" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "8KP1pYqmtXdr" + }, + "source": [ + "# Asynchrony and Power Iteration\n", + "\n", + "## Table of Contents\n", + "1. [Introduction and Setup](#1-Introduction-and-Setup)\n", + " - [1.1 Environment Setup](#11-Environment-Setup)\n", + "2. [Theory: Streams and Synchronization](#2-Theory:-Streams-and-Synchronization)\n", + "3. [The Baseline Implementation](#3-The-Baseline-Implementation)\n", + "4. [Profiling the Baseline](#4-Profiling-the-Baseline)\n", + "5. [Better Visibility with NVTX](#5-Better-Visibility-with-NVTX)\n", + "6. [Implementing Asynchrony](#6-Implementing-Asynchrony)\n", + "7. [Performance Analysis](#7-Performance-Analysis)\n", + "\n", + "## 1. Introduction and Setup\n", + "\n", + "GPU programming is inherently asynchronous. 
In this exercise, we will explore the implications of this behavior when using CuPy and learn how to analyze the flow of execution using profiling tools.\n", + "\n", + "We will revisit the Power Iteration algorithm. Our goal is to take a standard implementation, profile it to identify bottlenecks caused by implicit synchronization, and then optimize it using CUDA streams and asynchronous memory transfers.\n", + "\n", + "### 1.1 Environment Setup\n", + "\n", + "First, we need to ensure the Nsight Systems profiler (nsys), Nsightful, and NVTX are installed and available." + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rO4kOPuP_0JG" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Install necessary tools if running in Google Colab\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " !curl -s -L -O https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb\n", + " !sudo dpkg -i NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb > /dev/null\n", + " !pip install \"nvtx\" \"nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git\" > /dev/null 2>&1\n", + "\n", + "print(\"Environment setup complete.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6sUvjAtMxI3h" + }, + "source": [ + "## 2. Theory: Streams and Synchronization\n", + "\n", + "All GPU work is launched asynchronously on a stream. The work items in a stream are executed in order. If you launch `f` on a stream and later launch `g` on that same stream, then `f` will be executed before `g`. 
But if `f` and `g` are launched on different streams, then their execution might overlap.\n", + "\n", + "**How CuPy handles this:**\n", + "\n", + "- **Default Stream:** Unless specified, CuPy launches work on the default CUDA stream.\n", + "\n", + "- **Sequential Device Execution:** By default, CuPy work executes sequentially on the GPU.\n", + "\n", + "- **Asynchronous Host Execution:** From the Python (Host) perspective, the code often returns immediately after launching the GPU kernel, before the work is actually finished.\n", + "\n", + "**TODO:** Even though CuPy is asynchronous, certain operations force the CPU to wait for the GPU to finish. What operations do you think implicitly synchronize the host and device?\n", + "\n", + "## 3. The Baseline Implementation\n", + "\n", + "We will start with a baseline implementation of the Power Iteration algorithm.\n", + "\n", + "**Note:** The cell below writes the code to a file named `power_iteration__baseline.py`. We do this because we must run the code through the Nsight Systems profiler via the command line." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sEicxhLO_9G9" + }, + "outputs": [], + "source": [ + "%%writefile power_iteration__baseline.py\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import cupyx as cpx\n", + "import nvtx\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class PowerIterationConfig:\n", + " dim: int = 8192\n", + " dominance: float = 0.05\n", + " max_steps: int = 1000\n", + " check_frequency: int = 10\n", + " progress: bool = True\n", + " residual_threshold: float = 1e-10\n", + "\n", + "def generate_device(cfg=PowerIterationConfig()):\n", + " cp.random.seed(42)\n", + " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", + " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", + " P = cp.random.random((cfg.dim, cfg.dim))\n", + " D = cp.diag(cp.random.permutation(lam))\n", + " A = ((P @ D) @ cp.linalg.inv(P))\n", + " return A\n", + "\n", + "def estimate_device(A, cfg=PowerIterationConfig()):\n", + " # If `A` is on the host, copy from host to device. 
Otherwise, does nothing.\n", + " A_gpu = cp.asarray(A)\n", + "\n", + " x = cp.ones(A_gpu.shape[0], dtype=np.float64)\n", + "\n", + " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", + " y = A_gpu @ x\n", + " lam = (x @ y) / (x @ x) # Rayleigh quotient.\n", + " res = cp.linalg.norm(y - lam * x)\n", + " x = y / cp.linalg.norm(y) # Normalize for next step.\n", + "\n", + " if cfg.progress:\n", + " print(f\"step {i}: residual = {res:.3e}\")\n", + "\n", + " # Copy from device to host and save a checkpoint.\n", + " np.savetxt(f\"device_{i}.txt\", cp.asnumpy(x))\n", + "\n", + " if res < cfg.residual_threshold:\n", + " break\n", + "\n", + " for _ in range(cfg.check_frequency - 1):\n", + " y = A_gpu @ x # We have to use `A_gpu` here as well.\n", + " x = y / cp.linalg.norm(y) # Normalize for next step.\n", + "\n", + " # Copy from device to host.\n", + " return cp.asnumpy((x.T @ (A_gpu @ x)) / (x.T @ x))\n", + "\n", + "A_device = generate_device()\n", + "\n", + "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", + "estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", + "\n", + "start = cp.cuda.get_current_stream().record()\n", + "lam_est_device = estimate_device(A_device).item()\n", + "stop = cp.cuda.get_current_stream().record()\n", + "\n", + "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", + "\n", + "print()\n", + "print(f\"GPU Execution Time: {duration:.3f} s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1lyhHnzrdXzI" + }, + "source": [ + "## 4. Profiling the Baseline\n", + "\n", + "Now let's profile our code by running it under the Nsight Systems `nsys` tool. The syntax for this is `nsys `. It will run your program while collecting a birdseye view of everything going on in your program." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1HU5p1IhAkTA" + }, + "outputs": [], + "source": [ + "!nsys profile --cuda-event-trace=false --force-overwrite true -o power_iteration__baseline python power_iteration__baseline.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IlGIAIEPe3SV" + }, + "source": [ + "Now let's view our report and explore what's going on in our program.\n", + "\n", + "**TODO:** Run the next cell, which will generate the report and create a button that when clicked will open it up in Perfetto, a web-based no-install visual profiler.\n", + "\n", + "**EXTRA CREDIT:** Download the Nsight Systems GUI and open the report in it to see even more information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "s6VVOnGQR3Ph" + }, + "outputs": [], + "source": [ + "import nsightful\n", + "\n", + "!nsys export --type sqlite --quiet true --force-overwrite true power_iteration__baseline.nsys-rep\n", + "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__baseline.sqlite\", title=\"Power Iteration - Baseline\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bGxz6-spplcU" + }, + "source": [ + "## 5. Better Visibility with NVTX\n", + "\n", + "Nsight Systems shows us a lot of information - sometimes it's too much and not all relevant.\n", + "\n", + "There's two ways that we can filter and annotate what we see in Nsight systems.\n", + "\n", + "The first is to limit when we start and stop profiling in the program. In Python, we can do this with `cupyx.profiler.profile()`, which give us a Python context manager. 
Any CUDA code used during scope will be included in the profile.\n", + "\n", + "```\n", + "not_in_the profile()\n", + "with cpx.profiler.profile():\n", + " in_the_profile()\n", + "not_in_the_profile()\n", + "```\n", + "\n", + "For this to work, we have to pass `--capture-range=cudaProfilerApi --capture-range-end=stop` as flags to `nsys`.\n", + "\n", + "We can also annotate specific regions of our code, which will show up in the profiler. We can even add categories, domains, and colors to these regions, and they can be nested. To add these annotations, we use `nvtx.annnotate()`, another Python context manager, this time from a library called NVTX.\n", + "\n", + "```\n", + "with nvtx.annotate(\"Loop\")\n", + " for i in range(20):\n", + " with nvtx.annotate(f\"Step {i}\"):\n", + " pass\n", + "```\n", + "\n", + "**TODO:** Go back to the earlier cells and improve the profile results by adding:\n", + "\n", + "- `nvtx.annotate()` regions. Remember, you can nest them.\n", + "\n", + "- A `cpx.profiler.profile()` around the `start =`/`stop =` lines that run the solver.\n", + "\n", + "- `--capture-range=cudaProfilerApi --capture-range-end=stop` to the `nsys` flags.\n", + "\n", + "Then, capture another profile and see if you can identify how we can improve the code. Specifically, think about how we could add more asynchrony." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PF7PUALVfX3A" + }, + "source": [ + "## 6. Implementing Asynchrony\n", + "\n", + "Remember what we've learned about streams and how to use them with CuPy:\n", + "\n", + "- By default, all CuPy operations within a single thread run on the same stream. You can access this stream with `cp.cuda.get_current_stream()`.\n", + "\n", + "- You can create a new stream with `cp.cuda.Stream(non_blocking=True)`. 
Use `with` statements to use the stream for all CuPy operations within a block.\n", + "\n", + "- You can record an event on a stream by calling `.record()` on it.\n", + "\n", + "- You can synchronize on an event (or an entire stream) by calling `.synchronize()` on it.\n", + "\n", + "- Memory transfers will block by default. You can launch them asynchronously with `cp.asarray(..., blocking=False)` (for host to device transfers) and `cp.asnumpy(..., blocking=False)` (for device to host transfers).\n", + "\n", + "**TODO:** Copy the kernel from the earlier cell with your NVTX and CuPy profiler regions into the cell below. Then, try to improve performance by adding asynchrony. Make sure that you don't copy and paste the `%%writefile` directive." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile power_iteration__async.py\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import cupyx as cpx\n", + "import nvtx\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class PowerIterationConfig:\n", + " dim: int = 8192\n", + " dominance: float = 0.05\n", + " max_steps: int = 1000\n", + " check_frequency: int = 10\n", + " progress: bool = True\n", + " residual_threshold: float = 1e-10\n", + "\n", + "def generate_device(cfg=PowerIterationConfig()):\n", + " cp.random.seed(42)\n", + " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", + " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", + " P = cp.random.random((cfg.dim, cfg.dim))\n", + " D = cp.diag(cp.random.permutation(lam))\n", + " A = ((P @ D) @ cp.linalg.inv(P))\n", + " return A\n", + "\n", + "def estimate_device(A, cfg=PowerIterationConfig()):\n", + " raise NotImplementedError(\"TODO: You need to implement this kernel!\")\n", + "\n", + "A_device = generate_device()\n", + "\n", + "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", + 
"estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", + "\n", + "start = cp.cuda.get_current_stream().record()\n", + "lam_est_device = estimate_device(A_device).item()\n", + "stop = cp.cuda.get_current_stream().record()\n", + "\n", + "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", + "\n", + "print()\n", + "print(f\"GPU Execution Time: {duration:.3f} s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's make sure it works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pszz-k8cDfqy" + }, + "outputs": [], + "source": [ + "!python power_iteration__async.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VFCYIqwaKYqy" + }, + "source": [ + "## 7. Performance Analysis\n", + "\n", + "Before we profile the improved code, let's compare the execution times of both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSPFNIb9KcPb" + }, + "outputs": [], + "source": [ + "power_iteration_baseline_output = !python power_iteration__baseline.py\n", + "power_iteration_baseline_duration = float(power_iteration_baseline_output[-1].split()[-2])\n", + "power_iteration_async_output = !python power_iteration__async.py\n", + "power_iteration_async_duration = float(power_iteration_async_output[-1].split()[-2])\n", + "speedup = power_iteration_baseline_duration / power_iteration_async_duration\n", + "\n", + "print(f\"GPU Execution Time\")\n", + "print()\n", + "print(f\"power_iteration_baseline: {power_iteration_baseline_duration:.3f} s\")\n", + "print(f\"power_iteration_async: {power_iteration_async_duration:.3f} s\")\n", + "print(f\"power_iteration_async speedup over power_iteration_baseline: {speedup:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o4WJVFBkkRaN" + }, + "source": [ + "Next, let's capture a profile report of our improved code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BtQR4CHikWFK" + }, + "outputs": [], + "source": [ + "!nsys profile --cuda-event-trace=false --capture-range=cudaProfilerApi --capture-range-end=stop --force-overwrite true -o power_iteration__async python power_iteration__async.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pnvne_F4jYTh" + }, + "source": [ + "Finally, let's look at the profile in Perfetto and confirm we've gotten rid of the idling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mWXBvi-hFGhU" + }, + "outputs": [], + "source": [ + "!nsys export --type sqlite --quiet true --force-overwrite true power_iteration__async.nsys-rep\n", + "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__async.sqlite\", title=\"Power Iteration - Async Event\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb index a2962bae..39831e55 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb @@ -865,6 +865,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + 
"toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -881,10 +887,6 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" - }, - "colab": { - "provenance": [], - "gpuType": "T4" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/fundamentals/solutions/01__numpy_intro__ndarray_basics__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/solutions/01__numpy_intro__ndarray_basics__SOLUTION.ipynb index 3f8068d9..d2c0b7b4 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/solutions/01__numpy_intro__ndarray_basics__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/solutions/01__numpy_intro__ndarray_basics__SOLUTION.ipynb @@ -1,359 +1,372 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", - "metadata": { - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" - }, - "source": [ - "# NumPy `ndarray` Basics - SOLUTIONS" - ] - }, - { - "cell_type": "markdown", - "id": "a5ba6a1c", - "metadata": {}, - "source": [ - "## Table of Contents\n", - "\n", - "1. [The De Facto Standard for Array Data](#1.-The-De-Facto-Standard-for-Array-Data)\n", - "2. [Anatomy of an `ndarray`: Structure and Memory](#2.-Anatomy-of-an-`ndarray`:-Structure-and-Memory)\n", - "3. [Array Creation and Logical Views (Views vs. Copies)](#3.-Array-Creation-and-Logical-Views-(Views-vs.-Copies))\n", - "4. [Aggregations and Axes](#4.-Aggregations-and-Axes)\n", - "5. [Broadcasting: The \"Stretch\" Rule](#5.-Broadcasting:-The-\"Stretch\"-Rule)\n", - "6. [Why Vectorize? The Speed Advantage](#6.-Why-Vectorize?-The-Speed-Advantage)" - ] - }, - { - "cell_type": "markdown", - "id": "b30427de", - "metadata": {}, - "source": [ - "## 1. The De Facto Standard for Array Data\n", - "\n", - "NumPy is the foundational library for High Performance Computing (HPC) and Machine Learning (ML) in Python. 
Libraries like PyTorch, Pandas, and Scikit-learn are built upon or mirror the NumPy API. Learning NumPy is essential for mastering the Array Programming paradigm.\n", - "\n", - "NumPy provides the `ndarray` (N-dimensional array), a powerful, high-performance, and uniform container that enables highly efficient memory management, indexing, slicing, and, most importantly, vectorized arithmetic." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", - "metadata": { - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" - }, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "c59fce80", - "metadata": {}, - "source": [ - "## 2. Anatomy of an `ndarray`: Structure and Memory\n", - "\n", - "Unlike a standard Python list, an `ndarray` is a fixed-size, structured block of contiguous memory. Its efficiency comes from these four key, immutable properties:\n", - "\n", - "- **Data**: A pointer to the memory location holding the elements.\n", - "- **dtype**: The data type (e.g., `int32`, `float64`) which is uniform across all elements.\n", - "- **Shape**: A tuple defining the size along each dimension (e.g., $(100, 50)$ for 100 rows and 50 columns).\n", - "- **Strides**: The number of bytes to step in memory to reach the next element along each dimension—this is how NumPy efficiently handles different shapes and views.\n", - "\n", - "Let's explore these properties by creating a large dataset.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- `np.arange(start, stop, step)`: Returns evenly spaced values in the half-open interval $[\\text{start}, \\text{stop})$.\n", - "- `arr.nbytes`: Total bytes consumed by the array's elements (in bytes).\n", - "- `arr.ndim`: The number of array dimensions (integer).\n", - "- `arr.size`: The total number of elements in the array (integer).\n", - "- `arr.shape`: The tuple of array dimensions.\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "465e35bd", - "metadata": {}, - "outputs": [], - "source": [ - "# Use a large number to clearly demonstrate the memory density of ndarrays\n", - "N = 50_000_000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3", - "metadata": { - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3" - }, - "outputs": [], - "source": [ - "# SOLUTION: Create the input data array with the numbers 1 to 50_000_000 (inclusive).\n", - "# np.arange generates values within a half-open interval [start, stop), so we use N + 1 as the stop value.\n", - "arr = np.arange(1, N + 1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622", - "metadata": { - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622" - }, - "outputs": [], - "source": [ - "# SOLUTION: Calculate how large the array is in GB with nbytes.\n", - "# GB is 1e9 bytes. The .nbytes attribute returns the total bytes consumed by the elements.\n", - "# Note: This demonstrates that arrays are dense memory blocks, unlike pointer-heavy Python lists.\n", - "arr.nbytes / 1e9" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656", - "metadata": { - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656" - }, - "outputs": [], - "source": [ - "# SOLUTION: How many dimensions does the array have? (ndim)\n", - "arr.ndim" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28", - "metadata": { - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28" - }, - "outputs": [], - "source": [ - "# SOLUTION: How many elements does the array have? 
(size)\n", - "arr.size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63887722-c9d7-405e-a019-e75646115541", - "metadata": { - "id": "63887722-c9d7-405e-a019-e75646115541" - }, - "outputs": [], - "source": [ - "# SOLUTION: What is the shape of the array?\n", - "arr.shape" - ] - }, - { - "cell_type": "markdown", - "id": "f5e58ee4", - "metadata": {}, - "source": [ - "## 3. Array Creation and Logical Views (Views vs. Copies)\n", - "\n", - "Arrays can logically represent data in many ways (e.g., 1D signal, 2D image, 4D video batch) independent of the underlying physical memory block.\n", - "\n", - "A critical performance feature is that operations like transposing or `reshape` often return a **View** instead of a **Copy**. A View only changes the metadata (`shape` and `strides`) without duplicating the physical data, making these operations nearly instantaneous.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- `np.linspace(start, stop, num)`: Returns `num` evenly spaced samples, calculated over the interval $[\\text{start}, \\text{stop}]$.\n", - "- `np.random.default_rng().random(size)`: Returns random floats in $[0.0, 1.0)$. `size` can be a tuple.\n", - "- `arr.sort()`: Sorts an array in-place (modifies the original data). Use `np.sort(arr)` to return a sorted copy.\n", - "- `arr.reshape(new_shape)`: Returns a View with a new shape. One dimension can be -1, instructing NumPy to calculate the size automatically.\n", - "- `np.resize(arr, new_shape)`: Returns a new array with the specified shape. 
If the new shape is larger, it fills the new elements by repeating the original array.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9", - "metadata": { - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9" - }, - "outputs": [], - "source": [ - "# SOLUTION: Create a new array with 5_000_000 elements containing equally spaced values between 0 to 1000 (inclusive).\n", - "# np.linspace returns 'num' evenly spaced samples over the closed interval [start, stop].\n", - "arr = np.linspace(0, 1000, 5_000_000)\n", - "arr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050", - "metadata": { - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050" - }, - "outputs": [], - "source": [ - "# SOLUTION: Create a random array that is 10_000 rows by 5_000 columns.\n", - "# np.random.default_rng().random(size) returns random floats in [0.0, 1.0). size can be a tuple.\n", - "arr = np.random.default_rng().random((10_000, 5_000))\n", - "arr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd", - "metadata": { - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd" - }, - "outputs": [], - "source": [ - "# SOLUTION: Sort that array (in-place).\n", - "# Note: arr.sort() modifies the array directly, which is typically faster than creating a copy.\n", - "arr.sort()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d", - "metadata": { - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d" - }, - "outputs": [], - "source": [ - "# SOLUTION: Reshape the array to have the last dimension of length 5. 
\n", - "# Using -1 lets NumPy automatically calculate the first dimension.\n", - "# .reshape() returns a View (not a copy) when possible, so no data is duplicated.\n", - "arr_new = arr.reshape(-1, 5)\n", - "arr_new" - ] - }, - { - "cell_type": "markdown", - "id": "54982876", - "metadata": {}, - "source": [ - "## 4. Aggregations and Axes\n", - "\n", - "When performing aggregations (like `sum`, `mean`, `max`), you must specify the **Axis** you want to collapse (or reduce) the array along.\n", - "\n", - "- **Axis 0**: The first dimension (often rows in 2D). Aggregating across Axis 0 produces a result for each column.\n", - "- **Axis 1**: The second dimension (often columns in 2D). Aggregating across Axis 1 produces a result for each row.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- `np.sum(a, axis=None)`: Sum of array elements over a given axis.\n", - " - `axis=0`: Collapse the rows (sum vertical columns).\n", - " - `axis=1`: Collapse the columns (sum horizontal rows).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583", - "metadata": { - "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583" - }, - "outputs": [], - "source": [ - "# SOLUTION: Find the sum of each row in the reshaped array (arr_new) above.\n", - "# To sum the row's content, we reduce across the columns (axis=1).\n", - "# axis=1 collapses the second dimension (columns), leaving one sum per row.\n", - "arr_sum = np.sum(arr_new, axis=1)\n", - "arr_sum" - ] - }, - { - "cell_type": "markdown", - "id": "ed072cee", - "metadata": {}, - "source": [ - "## 5. Broadcasting: The \"Stretch\" Rule\n", - "\n", - "Broadcasting is NumPy's mechanism for performing arithmetic between arrays of different shapes. If dimensions don't match, NumPy attempts to \"stretch\" the smaller array to match the larger one.\n", - "\n", - "**The Compatibility Rule:** Two dimensions are compatible when:\n", - "1. They are equal, or\n", - "2. 
One of them is 1.\n", - "\n", - "If a dimension is 1, NumPy logically copies that single value across the dimension to match the other array's shape **without allocating any new memory**.\n", - "\n", - "---\n", - "\n", - "**Quick Docs**\n", - "- **Arithmetic Operators** (`/`, `*`, `+`, `-`): These operate element-wise. Broadcasting occurs if shapes are different but compatible.\n", - "- `np.allclose(a, b)`: Returns `True` if two floating-point arrays are element-wise equal within a tolerance. Essential for comparisons instead of using `==`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15342af-2916-481a-9724-9874acf4ed24", - "metadata": { - "id": "b15342af-2916-481a-9724-9874acf4ed24" - }, - "outputs": [], - "source": [ - "# SOLUTION: Normalize each row of the 2D array (arr_new) by dividing by the sum you just computed (arr_sum).\n", - "# 'arr_new' has shape (M, N) and 'arr_sum' has shape (M,).\n", - "# To successfully divide, we reshape 'arr_sum' to (M, 1) so broadcasting can stretch it across the N columns.\n", - "# Alternative approaches: arr_sum[:, np.newaxis] or arr_sum[:, None] also work.\n", - "arr_normalized = arr_new / arr_sum.reshape(-1, 1)\n", - "arr_normalized" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf", - "metadata": { - "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf" - }, - "outputs": [], - "source": [ - "# SOLUTION (EXTRA CREDIT): Prove that your normalized array is actually normalized.\n", - "# If normalized correctly, the sum of every row should now be 1.0.\n", - "# We compute row sums (axis=1) and check if they are all close to 1.0 using np.allclose.\n", - "np.allclose(np.sum(arr_normalized, axis=1), 1.0)" - ] - }, - { - "cell_type": "markdown", - "id": "31657dd2", - "metadata": {}, - "source": [ - "## 6. Why Vectorize? 
The Speed Advantage\n", - "\n", - "The entire Array Programming paradigm hinges on **Vectorization**.\n", - "\n", - "Why use complex shapes and broadcasting instead of simple Python `for` loops?\n", - "\n", - "NumPy's array functions are implemented in highly optimized native code (C/C++, Fortran). An operation like `A + A**2`, where `A` is a massive `ndarray`, is often $\\mathbf{100\\times}$ faster than performing the equivalent element-wise operation using explicit Python loops.\n", - "\n", - "**Always choose a vectorized NumPy function or operator over a manual Python loop.**" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } + "cells": [ + { + "cell_type": "markdown", + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", + "metadata": { + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" + }, + "source": [ + "# NumPy `ndarray` Basics - SOLUTIONS" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "cell_type": "markdown", + "id": "a5ba6a1c", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "1. [The De Facto Standard for Array Data](#1.-The-De-Facto-Standard-for-Array-Data)\n", + "2. [Anatomy of an `ndarray`: Structure and Memory](#2.-Anatomy-of-an-`ndarray`:-Structure-and-Memory)\n", + "3. [Array Creation and Logical Views (Views vs. Copies)](#3.-Array-Creation-and-Logical-Views-(Views-vs.-Copies))\n", + "4. [Aggregations and Axes](#4.-Aggregations-and-Axes)\n", + "5. [Broadcasting: The \"Stretch\" Rule](#5.-Broadcasting:-The-\"Stretch\"-Rule)\n", + "6. [Why Vectorize? The Speed Advantage](#6.-Why-Vectorize?-The-Speed-Advantage)" + ] + }, + { + "cell_type": "markdown", + "id": "b30427de", + "metadata": {}, + "source": [ + "## 1. The De Facto Standard for Array Data\n", + "\n", + "NumPy is the foundational library for High Performance Computing (HPC) and Machine Learning (ML) in Python. 
Libraries like PyTorch, Pandas, and Scikit-learn are built upon or mirror the NumPy API. Learning NumPy is essential for mastering the Array Programming paradigm.\n", + "\n", + "NumPy provides the `ndarray` (N-dimensional array), a powerful, high-performance, and uniform container that enables highly efficient memory management, indexing, slicing, and, most importantly, vectorized arithmetic." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", + "metadata": { + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "c59fce80", + "metadata": {}, + "source": [ + "## 2. Anatomy of an `ndarray`: Structure and Memory\n", + "\n", + "Unlike a standard Python list, an `ndarray` is a fixed-size, structured block of contiguous memory. Its efficiency comes from these four key, immutable properties:\n", + "\n", + "- **Data**: A pointer to the memory location holding the elements.\n", + "- **dtype**: The data type (e.g., `int32`, `float64`) which is uniform across all elements.\n", + "- **Shape**: A tuple defining the size along each dimension (e.g., $(100, 50)$ for 100 rows and 50 columns).\n", + "- **Strides**: The number of bytes to step in memory to reach the next element along each dimension—this is how NumPy efficiently handles different shapes and views.\n", + "\n", + "Let's explore these properties by creating a large dataset.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.arange(start, stop, step)`: Returns evenly spaced values in the half-open interval $[\\text{start}, \\text{stop})$.\n", + "- `arr.nbytes`: Total bytes consumed by the array's elements (in bytes).\n", + "- `arr.ndim`: The number of array dimensions (integer).\n", + "- `arr.size`: The total number of elements in the array (integer).\n", + "- `arr.shape`: The tuple of array dimensions.\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "465e35bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Use a large number to clearly demonstrate the memory density of ndarrays\n", + "N = 50_000_000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3", + "metadata": { + "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3" + }, + "outputs": [], + "source": [ + "# SOLUTION: Create the input data array with the numbers 1 to 50_000_000 (inclusive).\n", + "# np.arange generates values within a half-open interval [start, stop), so we use N + 1 as the stop value.\n", + "arr = np.arange(1, N + 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50530f2c-29bf-4061-8f84-bc5be00a5622", + "metadata": { + "id": "50530f2c-29bf-4061-8f84-bc5be00a5622" + }, + "outputs": [], + "source": [ + "# SOLUTION: Calculate how large the array is in GB with nbytes.\n", + "# GB is 1e9 bytes. The .nbytes attribute returns the total bytes consumed by the elements.\n", + "# Note: This demonstrates that arrays are dense memory blocks, unlike pointer-heavy Python lists.\n", + "arr.nbytes / 1e9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656", + "metadata": { + "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656" + }, + "outputs": [], + "source": [ + "# SOLUTION: How many dimensions does the array have? (ndim)\n", + "arr.ndim" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28", + "metadata": { + "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28" + }, + "outputs": [], + "source": [ + "# SOLUTION: How many elements does the array have? 
(size)\n", + "arr.size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63887722-c9d7-405e-a019-e75646115541", + "metadata": { + "id": "63887722-c9d7-405e-a019-e75646115541" + }, + "outputs": [], + "source": [ + "# SOLUTION: What is the shape of the array?\n", + "arr.shape" + ] + }, + { + "cell_type": "markdown", + "id": "f5e58ee4", + "metadata": {}, + "source": [ + "## 3. Array Creation and Logical Views (Views vs. Copies)\n", + "\n", + "Arrays can logically represent data in many ways (e.g., 1D signal, 2D image, 4D video batch) independent of the underlying physical memory block.\n", + "\n", + "A critical performance feature is that operations like transposing or `reshape` often return a **View** instead of a **Copy**. A View only changes the metadata (`shape` and `strides`) without duplicating the physical data, making these operations nearly instantaneous.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.linspace(start, stop, num)`: Returns `num` evenly spaced samples, calculated over the interval $[\\text{start}, \\text{stop}]$.\n", + "- `np.random.default_rng().random(size)`: Returns random floats in $[0.0, 1.0)$. `size` can be a tuple.\n", + "- `arr.sort()`: Sorts an array in-place (modifies the original data). Use `np.sort(arr)` to return a sorted copy.\n", + "- `arr.reshape(new_shape)`: Returns a View with a new shape. One dimension can be -1, instructing NumPy to calculate the size automatically.\n", + "- `np.resize(arr, new_shape)`: Returns a new array with the specified shape. 
If the new shape is larger, it fills the new elements by repeating the original array.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9", + "metadata": { + "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9" + }, + "outputs": [], + "source": [ + "# SOLUTION: Create a new array with 5_000_000 elements containing equally spaced values between 0 to 1000 (inclusive).\n", + "# np.linspace returns 'num' evenly spaced samples over the closed interval [start, stop].\n", + "arr = np.linspace(0, 1000, 5_000_000)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050", + "metadata": { + "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050" + }, + "outputs": [], + "source": [ + "# SOLUTION: Create a random array that is 10_000 rows by 5_000 columns.\n", + "# np.random.default_rng().random(size) returns random floats in [0.0, 1.0). size can be a tuple.\n", + "arr = np.random.default_rng().random((10_000, 5_000))\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd", + "metadata": { + "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd" + }, + "outputs": [], + "source": [ + "# SOLUTION: Sort that array (in-place).\n", + "# Note: arr.sort() modifies the array directly, which is typically faster than creating a copy.\n", + "arr.sort()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdde560b-5ba6-484c-a601-00b7ef71273d", + "metadata": { + "id": "cdde560b-5ba6-484c-a601-00b7ef71273d" + }, + "outputs": [], + "source": [ + "# SOLUTION: Reshape the array to have the last dimension of length 5. 
\n", + "# Using -1 lets NumPy automatically calculate the first dimension.\n", + "# .reshape() returns a View (not a copy) when possible, so no data is duplicated.\n", + "arr_new = arr.reshape(-1, 5)\n", + "arr_new" + ] + }, + { + "cell_type": "markdown", + "id": "54982876", + "metadata": {}, + "source": [ + "## 4. Aggregations and Axes\n", + "\n", + "When performing aggregations (like `sum`, `mean`, `max`), you must specify the **Axis** you want to collapse (or reduce) the array along.\n", + "\n", + "- **Axis 0**: The first dimension (often rows in 2D). Aggregating across Axis 0 produces a result for each column.\n", + "- **Axis 1**: The second dimension (often columns in 2D). Aggregating across Axis 1 produces a result for each row.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.sum(a, axis=None)`: Sum of array elements over a given axis.\n", + " - `axis=0`: Collapse the rows (sum vertical columns).\n", + " - `axis=1`: Collapse the columns (sum horizontal rows).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583", + "metadata": { + "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583" + }, + "outputs": [], + "source": [ + "# SOLUTION: Find the sum of each row in the reshaped array (arr_new) above.\n", + "# To sum the row's content, we reduce across the columns (axis=1).\n", + "# axis=1 collapses the second dimension (columns), leaving one sum per row.\n", + "arr_sum = np.sum(arr_new, axis=1)\n", + "arr_sum" + ] + }, + { + "cell_type": "markdown", + "id": "ed072cee", + "metadata": {}, + "source": [ + "## 5. Broadcasting: The \"Stretch\" Rule\n", + "\n", + "Broadcasting is NumPy's mechanism for performing arithmetic between arrays of different shapes. If dimensions don't match, NumPy attempts to \"stretch\" the smaller array to match the larger one.\n", + "\n", + "**The Compatibility Rule:** Two dimensions are compatible when:\n", + "1. They are equal, or\n", + "2. 
One of them is 1.\n", + "\n", + "If a dimension is 1, NumPy logically copies that single value across the dimension to match the other array's shape **without allocating any new memory**.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- **Arithmetic Operators** (`/`, `*`, `+`, `-`): These operate element-wise. Broadcasting occurs if shapes are different but compatible.\n", + "- `np.allclose(a, b)`: Returns `True` if two floating-point arrays are element-wise equal within a tolerance. Essential for comparisons instead of using `==`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15342af-2916-481a-9724-9874acf4ed24", + "metadata": { + "id": "b15342af-2916-481a-9724-9874acf4ed24" + }, + "outputs": [], + "source": [ + "# SOLUTION: Normalize each row of the 2D array (arr_new) by dividing by the sum you just computed (arr_sum).\n", + "# 'arr_new' has shape (M, N) and 'arr_sum' has shape (M,).\n", + "# To successfully divide, we reshape 'arr_sum' to (M, 1) so broadcasting can stretch it across the N columns.\n", + "# Alternative approaches: arr_sum[:, np.newaxis] or arr_sum[:, None] also work.\n", + "arr_normalized = arr_new / arr_sum.reshape(-1, 1)\n", + "arr_normalized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf", + "metadata": { + "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf" + }, + "outputs": [], + "source": [ + "# SOLUTION (EXTRA CREDIT): Prove that your normalized array is actually normalized.\n", + "# If normalized correctly, the sum of every row should now be 1.0.\n", + "# We compute row sums (axis=1) and check if they are all close to 1.0 using np.allclose.\n", + "np.allclose(np.sum(arr_normalized, axis=1), 1.0)" + ] + }, + { + "cell_type": "markdown", + "id": "31657dd2", + "metadata": {}, + "source": [ + "## 6. Why Vectorize? 
The Speed Advantage\n", + "\n", + "The entire Array Programming paradigm hinges on **Vectorization**.\n", + "\n", + "Why use complex shapes and broadcasting instead of simple Python `for` loops?\n", + "\n", + "NumPy's array functions are implemented in highly optimized native code (C/C++, Fortran). An operation like `A + A**2`, where `A` is a massive `ndarray`, is often $\\mathbf{100\\times}$ faster than performing the equivalent element-wise operation using explicit Python loops.\n", + "\n", + "**Always choose a vectorized NumPy function or operator over a manual Python loop.**" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/solutions/02__numpy_linear_algebra__svd_reconstruction__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/solutions/02__numpy_linear_algebra__svd_reconstruction__SOLUTION.ipynb index 07758b7d..681ab415 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/solutions/02__numpy_linear_algebra__svd_reconstruction__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/solutions/02__numpy_linear_algebra__svd_reconstruction__SOLUTION.ipynb @@ -444,6 +444,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -459,14 +465,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.12.11" - }, - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "accelerator": "GPU" + "version": "3.11.7" + } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/solutions/03__numpy_to_cupy__ndarray_basics__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/solutions/03__numpy_to_cupy__ndarray_basics__SOLUTION.ipynb index 52891823..8c15a669 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/solutions/03__numpy_to_cupy__ndarray_basics__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/solutions/03__numpy_to_cupy__ndarray_basics__SOLUTION.ipynb @@ -1,583 +1,601 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Accelerated Computing with CuPy\n", - "\n", - "## Table of Contents\n", - "1. [Creating Arrays: CPU vs. GPU](#1.-Creating-Arrays:-CPU-vs.-GPU)\n", - "2. [Basic Operations](#2.-Basic-Operations)\n", - " - [Sequential Operations & Memory](#Sequential-Operations-&-Memory)\n", - "3. [Complex Operations (Linear Algebra)](#3.-Complex-Operations-(Linear-Algebra))\n", - " - [Agnostic Code (NumPy Dispatch)](#Agnostic-Code-(NumPy-Dispatch))\n", - "4. [Device Management](#4.-Device-Management)\n", - "5. [Exercise - NumPy to CuPy](#Exercise---NumPy-to-CuPy)\n", - " - [Part 1](#Part-1)\n", - " - [Part 2](#Part-2)\n", - "\n", - "---\n", - "\n", - "Let's shift gears to high-level array functionality using **[CuPy](https://cupy.dev/)**.\n", - "\n", - "### What is CuPy?\n", - "CuPy is a library that implements the familiar **NumPy API** but runs on the GPU (using CUDA C++ in the backend). 
\n", - "\n", - "**Why use it?**\n", - "* **Zero Friction:** If you know NumPy, you already know CuPy.\n", - "* **Speed:** It provides out-of-the-box GPU acceleration for array operations.\n", - "* **Ease of use:** You can often port CPU code to GPU simply by changing `import numpy as np` to `import cupy as cp`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "from cupyx.profiler import benchmark\n", - "\n", - "# Helper to display benchmark results concisely.\n", - "# We use CuPy's benchmark() throughout this notebook for accurate GPU timing.\n", - "def print_benchmark(result, device=\"gpu\"):\n", - " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", - " if device == \"gpu\":\n", - " avg_ms = result.gpu_times.mean() * 1000\n", - " std_ms = result.gpu_times.std() * 1000\n", - " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", - " else:\n", - " avg_ms = result.cpu_times.mean() * 1000\n", - " std_ms = result.cpu_times.std() * 1000\n", - " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Creating Arrays: CPU vs. GPU\n", - "\n", - "Let's compare the performance of creating a large 3D array (approx. 2GB in size) on the CPU versus the GPU.\n", - "\n", - "We will use `np.ones` for the CPU and `cp.ones` for the GPU." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# CPU creation\n", - "print_benchmark(benchmark(np.ones, ((1000, 500, 500),), n_repeat=10), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# GPU creation\n", - "print_benchmark(benchmark(cp.ones, ((1000, 500, 500),), n_repeat=10), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see here that creating this array on the GPU is much faster than doing so on the CPU!\n", - "\n", - "**About `cupyx.profiler.benchmark`:**\n", - "\n", - "We use CuPy's built-in `benchmark` utility for timing GPU operations. This is important because GPU operations are **asynchronous** - when you call a CuPy function, the CPU places a task in the GPU's \"to-do list\" (stream) and immediately moves on without waiting.\n", - "\n", - "The `benchmark` function handles all the complexity of proper GPU timing for us:\n", - "- It automatically synchronizes GPU streams to get accurate measurements.\n", - "- It runs warm-up iterations to avoid cold-start overhead.\n", - "- It reports both CPU and GPU times separately.\n", - "\n", - "This makes it the recommended way to time CuPy code, as it's both accurate and convenient." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Basic Operations\n", - "\n", - "The syntax for mathematical operations is identical. Let's multiply every value in our arrays by `5`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create fresh arrays for the benchmark\n", - "x_cpu = np.ones((1000, 500, 500))\n", - "x_gpu = cp.ones((1000, 500, 500))\n", - "\n", - "def multiply(x):\n", - " return x * 5\n", - "\n", - "# CPU Operation\n", - "print_benchmark(benchmark(multiply, (x_cpu,), n_repeat=10), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# GPU Operation\n", - "print_benchmark(benchmark(multiply, (x_gpu,), n_repeat=10), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The GPU completes this operation notably faster, with the code staying the same." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sequential Operations & Memory\n", - "\n", - "Now let's do a couple of operations sequentially, something which would suffer from memory transfer times in Numba examples without explicit memory management." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def sequential_math(x):\n", - " x = x * 5\n", - " x = x * x\n", - " x = x + x\n", - " return x\n", - "\n", - "# CPU: Sequential math\n", - "print_benchmark(benchmark(sequential_math, (x_cpu,), n_repeat=10), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# GPU: Sequential math\n", - "print_benchmark(benchmark(sequential_math, (x_gpu,), n_repeat=10), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The GPU ran that much faster even without us explicitly managing memory. This is because CuPy is handling all of this for us transparently." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Complex Operations (Linear Algebra)\n", - "\n", - "GPUs excel at Linear Algebra. Let's look at **Singular Value Decomposition (SVD)**, a computationally heavy $O(N^3)$ operation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# CPU SVD\n", - "x_cpu = np.random.random((1000, 1000))\n", - "print_benchmark(benchmark(np.linalg.svd, (x_cpu,), n_repeat=5), device=\"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# GPU SVD\n", - "x_gpu = cp.random.random((1000, 1000))\n", - "print_benchmark(benchmark(cp.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The GPU outperforms the CPU again with exactly the same API!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Agnostic Code (NumPy Dispatch)\n", - "\n", - "A key feature of CuPy is that many **NumPy functions work on CuPy arrays without changing your code**.\n", - "\n", - "When you pass a CuPy GPU array (`x_gpu`) into a NumPy function that supports the `__array_function__` protocol (e.g., `np.linalg.svd`), NumPy detects the CuPy input and **delegates the operation to CuPy's own implementation**, which runs on the GPU.\n", - "\n", - "This allows you to write code using standard `np.*` syntax and have it run on either CPU or GPU seamlessly - **as long as CuPy implements an override for that function.**\n", - "\n", - "CuPy also protects you from hidden performance penalties: **it forbids implicit GPU → CPU copies**, raising a `TypeError` when NumPy tries to convert a `cupy.ndarray` into a `numpy.ndarray` behind the scenes. This ensures all device-to-host transfers are **explicit and intentional**, never silent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# We create the data on the GPU\n", - "x_gpu = cp.random.random((1000, 1000))\n", - "\n", - "# BUT we call the standard NumPy function - CuPy dispatches it to the GPU!\n", - "print_benchmark(benchmark(np.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Device Management\n", - "\n", - "If you have multiple GPUs, CuPy uses the concept of a \"Current Device\" context. \n", - "\n", - "You can use a `with` statement to ensure specific arrays are created on specific cards (e.g., GPU 0 vs GPU 1)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with cp.cuda.Device(0):\n", - " x_on_gpu0 = cp.random.random((100000, 1000))\n", - "\n", - "print(f\"Array is on device: {x_on_gpu0.device}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** CuPy functions generally expect all input arrays to be on the **same** device. Passing an array stored on a non-current device may work depending on the hardware configuration but is generally discouraged as it may not be performant." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise - NumPy to CuPy\n", - "\n", - "### Part 1\n", - "Let's put the \"Drop-in Replacement\" philosophy to the test with the same data pipeline as the previous notebook. Specifically, the single block of code below performs the following steps:\n", - "1) Generate a massive dataset (50 million elements).\n", - "2) Process it using a heavy operation (Sorting).\n", - "3) Manipulate the shape and normalize the data (Broadcasting).\n", - "4) Verify the integrity of the result.\n", - "\n", - "**TODO:**\n", - "1. Run the cell below with `xp = np` (CPU Mode). 
Note the benchmark output.\n", - "2. Change the setup line to `xp = cp` (GPU Mode). Run it again.\n", - "3. Observe how the exact same logic runs significantly faster on the GPU with CuPy while retaining the implementation properties of NumPy.\n", - "\n", - "Note: We use `cupyx.profiler.benchmark` for timing, which automatically handles GPU synchronization." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "from cupyx.profiler import benchmark\n", - "\n", - "# Re-defined here so this exercise cell is self-contained and can run independently.\n", - "def print_benchmark(result, device=\"gpu\"):\n", - " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", - " if device == \"gpu\":\n", - " avg_ms = result.gpu_times.mean() * 1000\n", - " std_ms = result.gpu_times.std() * 1000\n", - " else:\n", - " avg_ms = result.cpu_times.mean() * 1000\n", - " std_ms = result.cpu_times.std() * 1000\n", - " print(f\" -> {result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", - "\n", - "# --- 1. SETUP: CHOOSE YOUR DEVICE ---\n", - "# SOLUTION: Changed from 'np' to 'cp' for GPU acceleration\n", - "xp = cp # Toggle this to 'np' for CPU mode\n", - "\n", - "print(f\"Running on: {xp.__name__.upper()}\")\n", - "\n", - "# --- 2. DATA GENERATION ---\n", - "N = 50_000_000\n", - "print(f\"Generating {N:,} random elements ({N*8/1e9:.2f} GB)...\")\n", - "arr = xp.random.rand(N)\n", - "\n", - "# --- 3. HEAVY COMPUTATION (TIMED) ---\n", - "print(\"Sorting data...\")\n", - "# benchmark() handles GPU synchronization automatically\n", - "result = benchmark(xp.sort, (arr,), n_repeat=5)\n", - "print_benchmark(result, device=\"gpu\" if xp == cp else \"cpu\")\n", - "\n", - "# --- 4. 
MANIPULATION & BROADCASTING ---\n", - "# Purpose: Demonstrate that CuPy supports complex reshaping and broadcasting rules exactly like NumPy.\n", - "# This shows you don't need to rewrite your data processing logic.\n", - "\n", - "# Reshape to a matrix with 5 columns\n", - "arr_new = arr.reshape((-1, 5))\n", - "\n", - "# Normalize: Divide every row by its sum using broadcasting\n", - "row_sums = arr_new.sum(axis=1)\n", - "normalized_matrix = arr_new / row_sums[:, xp.newaxis]\n", - "\n", - "# --- 5. VERIFICATION ---\n", - "# Purpose: Verify mathematical correctness/integrity of the result.\n", - "check_sums = xp.sum(normalized_matrix, axis=1)\n", - "xp.testing.assert_allclose(check_sums, 1.0)\n", - "\n", - "print(\" -> Verification: PASSED (All rows sum to 1.0)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**TODO: When working with CuPy arrays, try changing `xp.testing.assert_allclose` to `np.testing.assert_allclose`. What happens and why?**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**SOLUTION:**\n", - "\n", - "When you change `xp.testing.assert_allclose` to `np.testing.assert_allclose` while working with CuPy arrays (`xp = cp`), you will get a **`TypeError`**.\n", - "\n", - "This happens because:\n", - "\n", - "1. `np.testing.assert_allclose` internally tries to convert its inputs to NumPy arrays.\n", - "2. CuPy arrays live on the GPU, and CuPy **explicitly forbids implicit GPU → CPU transfers**.\n", - "3. When NumPy's `assert_allclose` attempts to call `np.asarray()` on the CuPy array, CuPy raises a `TypeError` to prevent a silent (and potentially slow) data copy from GPU to CPU memory.\n", - "\n", - "This is a **safety feature** of CuPy! It ensures that all device-to-host transfers are **explicit and intentional**. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Part 2\n", - "We will now create a massive dataset (50 million points) representing a sine wave and see how fast the GPU can sort it compared to the CPU. \n", - "\n", - "**TODO:** \n", - "1) **Generate Data:** Create a NumPy array (`y_cpu`) and a CuPy array (`y_gpu`) representing $\\sin(x)$ from $0$ to $2\\pi$ with `50,000,000` points.\n", - "2) **Benchmark CPU and GPU:** Use `benchmark()` from `cupyx.profiler` to measure both `np.sort` and `cp.sort`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "from cupyx.profiler import benchmark\n", - "\n", - "# --- Step 1: Generate Data ---\n", - "N = 50_000_000\n", - "print(f\"Generating {N:,} points...\")\n", - "\n", - "# SOLUTION: Create x_cpu using np.linspace from 0 to 2*pi\n", - "x_cpu = np.linspace(0, 2 * np.pi, N)\n", - "# SOLUTION: Create y_cpu by taking np.sin(x_cpu)\n", - "y_cpu = np.sin(x_cpu)\n", - "\n", - "# SOLUTION: Create x_gpu using cp.linspace from 0 to 2*pi\n", - "x_gpu = cp.linspace(0, 2 * cp.pi, N)\n", - "# SOLUTION: Create y_gpu by taking cp.sin(x_gpu)\n", - "y_gpu = cp.sin(x_gpu)\n", - "\n", - "print(f\" CPU array shape: {y_cpu.shape}, dtype: {y_cpu.dtype}\")\n", - "print(f\" GPU array shape: {y_gpu.shape}, dtype: {y_gpu.dtype}\")\n", - "\n", - "# --- Step 2: Benchmark NumPy (CPU) ---\n", - "print(\"\\nBenchmarking NumPy Sort (this may take a few seconds)...\")\n", - "# SOLUTION: Use benchmark with np.sort\n", - "cpu_result = benchmark(np.sort, (y_cpu,), n_repeat=5)\n", - "cpu_avg_ms = cpu_result.cpu_times.mean() * 1000\n", - "cpu_std_ms = cpu_result.cpu_times.std() * 1000\n", - "print(f\" NumPy (CPU): {cpu_avg_ms:.3f} ms +/- {cpu_std_ms:.3f} ms\")\n", - "\n", - "# --- Step 3: Benchmark CuPy (GPU) ---\n", - "print(\"\\nBenchmarking CuPy Sort...\")\n", - "# SOLUTION: Use benchmark with cp.sort\n", - 
"gpu_result = benchmark(cp.sort, (y_gpu,), n_repeat=5)\n", - "gpu_avg_ms = gpu_result.gpu_times.mean() * 1000\n", - "gpu_std_ms = gpu_result.gpu_times.std() * 1000\n", - "print(f\" CuPy (GPU): {gpu_avg_ms:.3f} ms +/- {gpu_std_ms:.3f} ms\")\n", - "\n", - "# --- Summary ---\n", - "print(f\"\\n*** GPU Speedup: {cpu_avg_ms / gpu_avg_ms:.1f}x faster ***\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**EXTRA CREDIT: Benchmark with different array sizes and find the size at which CuPy and NumPy take the same amount of time. Try to extract the timing data from `cupyx.profiler.benchmark`'s return value and customize how the output is displayed. You could even make a graph.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# SOLUTION: Extra Credit - Finding the crossover point between CPU and GPU performance\n", - "\n", - "repeat = 10\n", - "warmup = 1\n", - "sizes = [5, 50, 500, 5_000, 50_000, 500_000, 5_000_000, 50_000_000]\n", - "\n", - "cpu_times = []\n", - "gpu_times = []\n", - "\n", - "print(\"Benchmarking different array sizes...\")\n", - "print(\"=\" * 70)\n", - "print(f\"{'Size':>15} | {'NumPy (CPU)':>15} | {'CuPy (GPU)':>15} | {'Winner':>10}\")\n", - "print(\"-\" * 70)\n", - "\n", - "for N in sizes:\n", - " # Generate sine wave data\n", - " y_cpu = np.sin(np.linspace(0, 2 * np.pi, N))\n", - " y_gpu = cp.sin(cp.linspace(0, 2 * cp.pi, N))\n", - "\n", - " # Benchmark CPU\n", - " cpu_result = benchmark(np.sort, (y_cpu,), n_repeat=repeat, n_warmup=warmup)\n", - " cpu_time_ms = cpu_result.cpu_times.mean() * 1000\n", - " cpu_times.append(cpu_time_ms)\n", - "\n", - " # Benchmark GPU\n", - " gpu_result = benchmark(cp.sort, (y_gpu,), n_repeat=repeat, n_warmup=warmup)\n", - " gpu_time_ms = gpu_result.gpu_times.mean() * 1000\n", - " gpu_times.append(gpu_time_ms)\n", - "\n", - " # Determine winner\n", - " winner = \"GPU\" if gpu_time_ms < cpu_time_ms else \"CPU\"\n", - "\n", - " 
print(f\"{N:>15,} | {cpu_time_ms:>12.3f} ms | {gpu_time_ms:>12.3f} ms | {winner:>10}\")\n", - "\n", - "print(\"=\" * 70)\n", - "\n", - "# Find approximate crossover point\n", - "crossover_idx = None\n", - "for i in range(len(sizes) - 1):\n", - " # Check if GPU becomes faster between size[i] and size[i+1]\n", - " if cpu_times[i] <= gpu_times[i] and cpu_times[i+1] > gpu_times[i+1]:\n", - " crossover_idx = i\n", - " break\n", - "\n", - "if crossover_idx is not None:\n", - " print(f\"\\nCrossover point: GPU becomes faster between {sizes[crossover_idx]:,} and {sizes[crossover_idx+1]:,} elements\")\n", - "else:\n", - " if gpu_times[0] < cpu_times[0]:\n", - " print(f\"\\nGPU is faster for all tested sizes (even at {sizes[0]:,} elements)\")\n", - " else:\n", - " print(f\"\\nCPU is faster for all tested sizes (even at {sizes[-1]:,} elements)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# SOLUTION: Extra Credit (continued) - Visualization\n", - "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Create the plot\n", - "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))\n", - "\n", - "# Plot 1: Absolute times (log-log scale)\n", - "ax1.loglog(sizes, cpu_times, 'b-o', label='NumPy (CPU)', linewidth=2, markersize=8)\n", - "ax1.loglog(sizes, gpu_times, 'r-s', label='CuPy (GPU)', linewidth=2, markersize=8)\n", - "for x, y in zip(sizes, cpu_times):\n", - " ax1.text(x, y, f'{x:.3g}', va='bottom', ha='right')\n", - "for x, y in zip(sizes, gpu_times):\n", - " ax1.text(x, y, f'{x:.3g}', va='top', ha='left')\n", - "ax1.set_xlabel('Array Size (elements)', fontsize=12)\n", - "ax1.set_ylabel('Time (ms)', fontsize=12)\n", - "ax1.set_title('Sort Performance: CPU vs GPU', fontsize=14)\n", - "ax1.legend(fontsize=11)\n", - "ax1.grid(True, alpha=0.3)\n", - "\n", - "# Plot 2: Speedup ratio\n", - "speedups = [cpu / gpu for cpu, gpu in zip(cpu_times, gpu_times)]\n", - "colors = ['green' if s > 1 else 'red' for s in 
speedups]\n", - "ax2.bar(range(len(sizes)), speedups, color=colors, alpha=0.7, edgecolor='black')\n", - "ax2.axhline(y=1.0, color='black', linestyle='--', linewidth=2, label='Break-even')\n", - "ax2.set_xticks(range(len(sizes)))\n", - "ax2.set_xticklabels([f'{s:,}' for s in sizes], rotation=45, ha='right', fontsize=9)\n", - "ax2.set_xlabel('Array Size (elements)', fontsize=12)\n", - "ax2.set_ylabel('GPU Speedup (CPU time / GPU time)', fontsize=12)\n", - "ax2.set_title('GPU Speedup Factor', fontsize=14)\n", - "ax2.legend(fontsize=11)\n", - "ax2.grid(True, axis='y', alpha=0.3)\n", - "\n", - "# Add value labels on bars\n", - "for i, (speedup, color) in enumerate(zip(speedups, colors)):\n", - " label = f'{speedup:.1f}x'\n", - " ax2.annotate(label, (i, speedup), textcoords=\"offset points\",\n", - " xytext=(0, 5), ha='center', fontsize=9, fontweight='bold')\n", - "\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "print(\"\\n*** Analysis Complete ***\")\n", - "print(f\"Maximum GPU speedup: {max(speedups):.1f}x at {sizes[speedups.index(max(speedups))]:,} elements\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerated Computing with CuPy\n", + "\n", + "## Table of Contents\n", + "1. [Creating Arrays: CPU vs. GPU](#1.-Creating-Arrays:-CPU-vs.-GPU)\n", + "2. [Basic Operations](#2.-Basic-Operations)\n", + " - [Sequential Operations & Memory](#Sequential-Operations-&-Memory)\n", + "3. [Complex Operations (Linear Algebra)](#3.-Complex-Operations-(Linear-Algebra))\n", + " - [Agnostic Code (NumPy Dispatch)](#Agnostic-Code-(NumPy-Dispatch))\n", + "4. [Device Management](#4.-Device-Management)\n", + "5. 
[Exercise - NumPy to CuPy](#Exercise---NumPy-to-CuPy)\n", + " - [Part 1](#Part-1)\n", + " - [Part 2](#Part-2)\n", + "\n", + "---\n", + "\n", + "Let's shift gears to high-level array functionality using **[CuPy](https://cupy.dev/)**.\n", + "\n", + "### What is CuPy?\n", + "CuPy is a library that implements the familiar **NumPy API** but runs on the GPU (using CUDA C++ in the backend). \n", + "\n", + "**Why use it?**\n", + "* **Zero Friction:** If you know NumPy, you already know CuPy.\n", + "* **Speed:** It provides out-of-the-box GPU acceleration for array operations.\n", + "* **Ease of use:** You can often port CPU code to GPU simply by changing `import numpy as np` to `import cupy as cp`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "from cupyx.profiler import benchmark\n", + "\n", + "# Helper to display benchmark results concisely.\n", + "# We use CuPy's benchmark() throughout this notebook for accurate GPU timing.\n", + "def print_benchmark(result, device=\"gpu\"):\n", + " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", + " if device == \"gpu\":\n", + " avg_ms = result.gpu_times.mean() * 1000\n", + " std_ms = result.gpu_times.std() * 1000\n", + " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", + " else:\n", + " avg_ms = result.cpu_times.mean() * 1000\n", + " std_ms = result.cpu_times.std() * 1000\n", + " print(f\"{result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Creating Arrays: CPU vs. GPU\n", + "\n", + "Let's compare the performance of creating a large 3D array (approx. 2GB in size) on the CPU versus the GPU.\n", + "\n", + "We will use `np.ones` for the CPU and `cp.ones` for the GPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# CPU creation\n", + "print_benchmark(benchmark(np.ones, ((1000, 500, 500),), n_repeat=10), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GPU creation\n", + "print_benchmark(benchmark(cp.ones, ((1000, 500, 500),), n_repeat=10), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see here that creating this array on the GPU is much faster than doing so on the CPU!\n", + "\n", + "**About `cupyx.profiler.benchmark`:**\n", + "\n", + "We use CuPy's built-in `benchmark` utility for timing GPU operations. This is important because GPU operations are **asynchronous** - when you call a CuPy function, the CPU places a task in the GPU's \"to-do list\" (stream) and immediately moves on without waiting.\n", + "\n", + "The `benchmark` function handles all the complexity of proper GPU timing for us:\n", + "- It automatically synchronizes GPU streams to get accurate measurements.\n", + "- It runs warm-up iterations to avoid cold-start overhead.\n", + "- It reports both CPU and GPU times separately.\n", + "\n", + "This makes it the recommended way to time CuPy code, as it's both accurate and convenient." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Basic Operations\n", + "\n", + "The syntax for mathematical operations is identical. Let's multiply every value in our arrays by `5`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create fresh arrays for the benchmark\n", + "x_cpu = np.ones((1000, 500, 500))\n", + "x_gpu = cp.ones((1000, 500, 500))\n", + "\n", + "def multiply(x):\n", + " return x * 5\n", + "\n", + "# CPU Operation\n", + "print_benchmark(benchmark(multiply, (x_cpu,), n_repeat=10), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GPU Operation\n", + "print_benchmark(benchmark(multiply, (x_gpu,), n_repeat=10), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The GPU completes this operation notably faster, with the code staying the same." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sequential Operations & Memory\n", + "\n", + "Now let's do a couple of operations sequentially, something which would suffer from memory transfer times in Numba examples without explicit memory management." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def sequential_math(x):\n", + " x = x * 5\n", + " x = x * x\n", + " x = x + x\n", + " return x\n", + "\n", + "# CPU: Sequential math\n", + "print_benchmark(benchmark(sequential_math, (x_cpu,), n_repeat=10), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GPU: Sequential math\n", + "print_benchmark(benchmark(sequential_math, (x_gpu,), n_repeat=10), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The GPU ran that much faster even without us explicitly managing memory. This is because CuPy is handling all of this for us transparently." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Complex Operations (Linear Algebra)\n", + "\n", + "GPUs excel at Linear Algebra. Let's look at **Singular Value Decomposition (SVD)**, a computationally heavy $O(N^3)$ operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# CPU SVD\n", + "x_cpu = np.random.random((1000, 1000))\n", + "print_benchmark(benchmark(np.linalg.svd, (x_cpu,), n_repeat=5), device=\"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GPU SVD\n", + "x_gpu = cp.random.random((1000, 1000))\n", + "print_benchmark(benchmark(cp.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The GPU outperforms the CPU again with exactly the same API!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Agnostic Code (NumPy Dispatch)\n", + "\n", + "A key feature of CuPy is that many **NumPy functions work on CuPy arrays without changing your code**.\n", + "\n", + "When you pass a CuPy GPU array (`x_gpu`) into a NumPy function that supports the `__array_function__` protocol (e.g., `np.linalg.svd`), NumPy detects the CuPy input and **delegates the operation to CuPy's own implementation**, which runs on the GPU.\n", + "\n", + "This allows you to write code using standard `np.*` syntax and have it run on either CPU or GPU seamlessly - **as long as CuPy implements an override for that function.**\n", + "\n", + "CuPy also protects you from hidden performance penalties: **it forbids implicit GPU → CPU copies**, raising a `TypeError` when NumPy tries to convert a `cupy.ndarray` into a `numpy.ndarray` behind the scenes. This ensures all device-to-host transfers are **explicit and intentional**, never silent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We create the data on the GPU\n", + "x_gpu = cp.random.random((1000, 1000))\n", + "\n", + "# BUT we call the standard NumPy function - CuPy dispatches it to the GPU!\n", + "print_benchmark(benchmark(np.linalg.svd, (x_gpu,), n_repeat=5), device=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Device Management\n", + "\n", + "If you have multiple GPUs, CuPy uses the concept of a \"Current Device\" context. \n", + "\n", + "You can use a `with` statement to ensure specific arrays are created on specific cards (e.g., GPU 0 vs GPU 1)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with cp.cuda.Device(0):\n", + " x_on_gpu0 = cp.random.random((100000, 1000))\n", + "\n", + "print(f\"Array is on device: {x_on_gpu0.device}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** CuPy functions generally expect all input arrays to be on the **same** device. Passing an array stored on a non-current device may work depending on the hardware configuration but is generally discouraged as it may not be performant." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise - NumPy to CuPy\n", + "\n", + "### Part 1\n", + "Let's put the \"Drop-in Replacement\" philosophy to the test with the same data pipeline as the previous notebook. Specifically, the single block of code below performs the following steps:\n", + "1) Generate a massive dataset (50 million elements).\n", + "2) Process it using a heavy operation (Sorting).\n", + "3) Manipulate the shape and normalize the data (Broadcasting).\n", + "4) Verify the integrity of the result.\n", + "\n", + "**TODO:**\n", + "1. Run the cell below with `xp = np` (CPU Mode). 
Note the benchmark output.\n", + "2. Change the setup line to `xp = cp` (GPU Mode). Run it again.\n", + "3. Observe how the exact same logic runs significantly faster on the GPU with CuPy while retaining the implementation properties of NumPy.\n", + "\n", + "Note: We use `cupyx.profiler.benchmark` for timing, which automatically handles GPU synchronization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "from cupyx.profiler import benchmark\n", + "\n", + "# Re-defined here so this exercise cell is self-contained and can run independently.\n", + "def print_benchmark(result, device=\"gpu\"):\n", + " \"\"\"Print benchmark result showing only the relevant time.\"\"\"\n", + " if device == \"gpu\":\n", + " avg_ms = result.gpu_times.mean() * 1000\n", + " std_ms = result.gpu_times.std() * 1000\n", + " else:\n", + " avg_ms = result.cpu_times.mean() * 1000\n", + " std_ms = result.cpu_times.std() * 1000\n", + " print(f\" -> {result.name}: {avg_ms:.3f} ms +/- {std_ms:.3f} ms\")\n", + "\n", + "# --- 1. SETUP: CHOOSE YOUR DEVICE ---\n", + "# SOLUTION: Changed from 'np' to 'cp' for GPU acceleration\n", + "xp = cp # Toggle this to 'np' for CPU mode\n", + "\n", + "print(f\"Running on: {xp.__name__.upper()}\")\n", + "\n", + "# --- 2. DATA GENERATION ---\n", + "N = 50_000_000\n", + "print(f\"Generating {N:,} random elements ({N*8/1e9:.2f} GB)...\")\n", + "arr = xp.random.rand(N)\n", + "\n", + "# --- 3. HEAVY COMPUTATION (TIMED) ---\n", + "print(\"Sorting data...\")\n", + "# benchmark() handles GPU synchronization automatically\n", + "result = benchmark(xp.sort, (arr,), n_repeat=5)\n", + "print_benchmark(result, device=\"gpu\" if xp == cp else \"cpu\")\n", + "\n", + "# --- 4. 
MANIPULATION & BROADCASTING ---\n", + "# Purpose: Demonstrate that CuPy supports complex reshaping and broadcasting rules exactly like NumPy.\n", + "# This shows you don't need to rewrite your data processing logic.\n", + "\n", + "# Reshape to a matrix with 5 columns\n", + "arr_new = arr.reshape((-1, 5))\n", + "\n", + "# Normalize: Divide every row by its sum using broadcasting\n", + "row_sums = arr_new.sum(axis=1)\n", + "normalized_matrix = arr_new / row_sums[:, xp.newaxis]\n", + "\n", + "# --- 5. VERIFICATION ---\n", + "# Purpose: Verify mathematical correctness/integrity of the result.\n", + "check_sums = xp.sum(normalized_matrix, axis=1)\n", + "xp.testing.assert_allclose(check_sums, 1.0)\n", + "\n", + "print(\" -> Verification: PASSED (All rows sum to 1.0)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**TODO: When working with CuPy arrays, try changing `xp.testing.assert_allclose` to `np.testing.assert_allclose`. What happens and why?**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**SOLUTION:**\n", + "\n", + "When you change `xp.testing.assert_allclose` to `np.testing.assert_allclose` while working with CuPy arrays (`xp = cp`), you will get a **`TypeError`**.\n", + "\n", + "This happens because:\n", + "\n", + "1. `np.testing.assert_allclose` internally tries to convert its inputs to NumPy arrays.\n", + "2. CuPy arrays live on the GPU, and CuPy **explicitly forbids implicit GPU → CPU transfers**.\n", + "3. When NumPy's `assert_allclose` attempts to call `np.asarray()` on the CuPy array, CuPy raises a `TypeError` to prevent a silent (and potentially slow) data copy from GPU to CPU memory.\n", + "\n", + "This is a **safety feature** of CuPy! It ensures that all device-to-host transfers are **explicit and intentional**. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Part 2\n", + "We will now create a massive dataset (50 million points) representing a sine wave and see how fast the GPU can sort it compared to the CPU. \n", + "\n", + "**TODO:** \n", + "1) **Generate Data:** Create a NumPy array (`y_cpu`) and a CuPy array (`y_gpu`) representing $\\sin(x)$ from $0$ to $2\\pi$ with `50,000,000` points.\n", + "2) **Benchmark CPU and GPU:** Use `benchmark()` from `cupyx.profiler` to measure both `np.sort` and `cp.sort`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "from cupyx.profiler import benchmark\n", + "\n", + "# --- Step 1: Generate Data ---\n", + "N = 50_000_000\n", + "print(f\"Generating {N:,} points...\")\n", + "\n", + "# SOLUTION: Create x_cpu using np.linspace from 0 to 2*pi\n", + "x_cpu = np.linspace(0, 2 * np.pi, N)\n", + "# SOLUTION: Create y_cpu by taking np.sin(x_cpu)\n", + "y_cpu = np.sin(x_cpu)\n", + "\n", + "# SOLUTION: Create x_gpu using cp.linspace from 0 to 2*pi\n", + "x_gpu = cp.linspace(0, 2 * cp.pi, N)\n", + "# SOLUTION: Create y_gpu by taking cp.sin(x_gpu)\n", + "y_gpu = cp.sin(x_gpu)\n", + "\n", + "print(f\" CPU array shape: {y_cpu.shape}, dtype: {y_cpu.dtype}\")\n", + "print(f\" GPU array shape: {y_gpu.shape}, dtype: {y_gpu.dtype}\")\n", + "\n", + "# --- Step 2: Benchmark NumPy (CPU) ---\n", + "print(\"\\nBenchmarking NumPy Sort (this may take a few seconds)...\")\n", + "# SOLUTION: Use benchmark with np.sort\n", + "cpu_result = benchmark(np.sort, (y_cpu,), n_repeat=5)\n", + "cpu_avg_ms = cpu_result.cpu_times.mean() * 1000\n", + "cpu_std_ms = cpu_result.cpu_times.std() * 1000\n", + "print(f\" NumPy (CPU): {cpu_avg_ms:.3f} ms +/- {cpu_std_ms:.3f} ms\")\n", + "\n", + "# --- Step 3: Benchmark CuPy (GPU) ---\n", + "print(\"\\nBenchmarking CuPy Sort...\")\n", + "# SOLUTION: Use benchmark with cp.sort\n", + 
"gpu_result = benchmark(cp.sort, (y_gpu,), n_repeat=5)\n", + "gpu_avg_ms = gpu_result.gpu_times.mean() * 1000\n", + "gpu_std_ms = gpu_result.gpu_times.std() * 1000\n", + "print(f\" CuPy (GPU): {gpu_avg_ms:.3f} ms +/- {gpu_std_ms:.3f} ms\")\n", + "\n", + "# --- Summary ---\n", + "print(f\"\\n*** GPU Speedup: {cpu_avg_ms / gpu_avg_ms:.1f}x faster ***\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXTRA CREDIT: Benchmark with different array sizes and find the size at which CuPy and NumPy take the same amount of time. Try to extract the timing data from `cupyx.profiler.benchmark`'s return value and customize how the output is displayed. You could even make a graph.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# SOLUTION: Extra Credit - Finding the crossover point between CPU and GPU performance\n", + "\n", + "repeat = 10\n", + "warmup = 1\n", + "sizes = [5, 50, 500, 5_000, 50_000, 500_000, 5_000_000, 50_000_000]\n", + "\n", + "cpu_times = []\n", + "gpu_times = []\n", + "\n", + "print(\"Benchmarking different array sizes...\")\n", + "print(\"=\" * 70)\n", + "print(f\"{'Size':>15} | {'NumPy (CPU)':>15} | {'CuPy (GPU)':>15} | {'Winner':>10}\")\n", + "print(\"-\" * 70)\n", + "\n", + "for N in sizes:\n", + " # Generate sine wave data\n", + " y_cpu = np.sin(np.linspace(0, 2 * np.pi, N))\n", + " y_gpu = cp.sin(cp.linspace(0, 2 * cp.pi, N))\n", + "\n", + " # Benchmark CPU\n", + " cpu_result = benchmark(np.sort, (y_cpu,), n_repeat=repeat, n_warmup=warmup)\n", + " cpu_time_ms = cpu_result.cpu_times.mean() * 1000\n", + " cpu_times.append(cpu_time_ms)\n", + "\n", + " # Benchmark GPU\n", + " gpu_result = benchmark(cp.sort, (y_gpu,), n_repeat=repeat, n_warmup=warmup)\n", + " gpu_time_ms = gpu_result.gpu_times.mean() * 1000\n", + " gpu_times.append(gpu_time_ms)\n", + "\n", + " # Determine winner\n", + " winner = \"GPU\" if gpu_time_ms < cpu_time_ms else \"CPU\"\n", + "\n", + " 
print(f\"{N:>15,} | {cpu_time_ms:>12.3f} ms | {gpu_time_ms:>12.3f} ms | {winner:>10}\")\n", + "\n", + "print(\"=\" * 70)\n", + "\n", + "# Find approximate crossover point\n", + "crossover_idx = None\n", + "for i in range(len(sizes) - 1):\n", + " # Check if GPU becomes faster between size[i] and size[i+1]\n", + " if cpu_times[i] <= gpu_times[i] and cpu_times[i+1] > gpu_times[i+1]:\n", + " crossover_idx = i\n", + " break\n", + "\n", + "if crossover_idx is not None:\n", + " print(f\"\\nCrossover point: GPU becomes faster between {sizes[crossover_idx]:,} and {sizes[crossover_idx+1]:,} elements\")\n", + "else:\n", + " if gpu_times[0] < cpu_times[0]:\n", + " print(f\"\\nGPU is faster for all tested sizes (even at {sizes[0]:,} elements)\")\n", + " else:\n", + " print(f\"\\nCPU is faster for all tested sizes (even at {sizes[-1]:,} elements)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# SOLUTION: Extra Credit (continued) - Visualization\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Create the plot\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))\n", + "\n", + "# Plot 1: Absolute times (log-log scale)\n", + "ax1.loglog(sizes, cpu_times, 'b-o', label='NumPy (CPU)', linewidth=2, markersize=8)\n", + "ax1.loglog(sizes, gpu_times, 'r-s', label='CuPy (GPU)', linewidth=2, markersize=8)\n", + "for x, y in zip(sizes, cpu_times):\n", + " ax1.text(x, y, f'{x:.3g}', va='bottom', ha='right')\n", + "for x, y in zip(sizes, gpu_times):\n", + " ax1.text(x, y, f'{x:.3g}', va='top', ha='left')\n", + "ax1.set_xlabel('Array Size (elements)', fontsize=12)\n", + "ax1.set_ylabel('Time (ms)', fontsize=12)\n", + "ax1.set_title('Sort Performance: CPU vs GPU', fontsize=14)\n", + "ax1.legend(fontsize=11)\n", + "ax1.grid(True, alpha=0.3)\n", + "\n", + "# Plot 2: Speedup ratio\n", + "speedups = [cpu / gpu for cpu, gpu in zip(cpu_times, gpu_times)]\n", + "colors = ['green' if s > 1 else 'red' for s in 
speedups]\n", + "ax2.bar(range(len(sizes)), speedups, color=colors, alpha=0.7, edgecolor='black')\n", + "ax2.axhline(y=1.0, color='black', linestyle='--', linewidth=2, label='Break-even')\n", + "ax2.set_xticks(range(len(sizes)))\n", + "ax2.set_xticklabels([f'{s:,}' for s in sizes], rotation=45, ha='right', fontsize=9)\n", + "ax2.set_xlabel('Array Size (elements)', fontsize=12)\n", + "ax2.set_ylabel('GPU Speedup (CPU time / GPU time)', fontsize=12)\n", + "ax2.set_title('GPU Speedup Factor', fontsize=14)\n", + "ax2.legend(fontsize=11)\n", + "ax2.grid(True, axis='y', alpha=0.3)\n", + "\n", + "# Add value labels on bars\n", + "for i, (speedup, color) in enumerate(zip(speedups, colors)):\n", + " label = f'{speedup:.1f}x'\n", + " ax2.annotate(label, (i, speedup), textcoords=\"offset points\",\n", + " xytext=(0, 5), ha='center', fontsize=9, fontweight='bold')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"\\n*** Analysis Complete ***\")\n", + "print(f\"Maximum GPU speedup: {max(speedups):.1f}x at {sizes[speedups.index(max(speedups))]:,} elements\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/solutions/04__numpy_to_cupy__svd_reconstruction__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/solutions/04__numpy_to_cupy__svd_reconstruction__SOLUTION.ipynb index 6cf25ff5..832dc238 100644 --- 
a/tutorials/accelerated-python/notebooks/fundamentals/solutions/04__numpy_to_cupy__svd_reconstruction__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/solutions/04__numpy_to_cupy__svd_reconstruction__SOLUTION.ipynb @@ -662,7 +662,8 @@ "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -679,9 +680,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/solutions/05__memory_spaces__power_iteration__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/solutions/05__memory_spaces__power_iteration__SOLUTION.ipynb index 9a14d16a..765af6c9 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/solutions/05__memory_spaces__power_iteration__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/solutions/05__memory_spaces__power_iteration__SOLUTION.ipynb @@ -540,12 +540,30 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/solutions/06__asynchrony__power_iteration__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/solutions/06__asynchrony__power_iteration__SOLUTION.ipynb index 99328e66..9cdfe440 100644 --- 
a/tutorials/accelerated-python/notebooks/fundamentals/solutions/06__asynchrony__power_iteration__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/solutions/06__asynchrony__power_iteration__SOLUTION.ipynb @@ -447,14 +447,27 @@ "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/solutions/07__cuda_core__devices_streams_and_memory__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/solutions/07__cuda_core__devices_streams_and_memory__SOLUTION.ipynb index 788c25cb..2df1853e 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/solutions/07__cuda_core__devices_streams_and_memory__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/solutions/07__cuda_core__devices_streams_and_memory__SOLUTION.ipynb @@ -145,9 +145,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -164,7 +166,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb b/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb index 1b722bbd..d1ac1a82 100644 --- a/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb +++ 
b/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb @@ -329,14 +329,27 @@ "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb b/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb index cb229c1e..c6a0b6f6 100644 --- a/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb @@ -418,12 +418,25 @@ "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/kernels/42__kernel_authoring__gaussian_blur.ipynb b/tutorials/accelerated-python/notebooks/kernels/42__kernel_authoring__gaussian_blur.ipynb index 85bc8e98..9d461764 100644 --- a/tutorials/accelerated-python/notebooks/kernels/42__kernel_authoring__gaussian_blur.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/42__kernel_authoring__gaussian_blur.ipynb @@ -322,7 +322,7 @@ 
"name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/kernels/43__kernel_authoring__black_and_white.ipynb b/tutorials/accelerated-python/notebooks/kernels/43__kernel_authoring__black_and_white.ipynb index d227c4e2..3ddd380e 100644 --- a/tutorials/accelerated-python/notebooks/kernels/43__kernel_authoring__black_and_white.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/43__kernel_authoring__black_and_white.ipynb @@ -197,7 +197,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb index dc86022b..26071cc6 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb @@ -615,7 +615,8 @@ "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -632,2469 +633,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "0495f43209454c7dade96e09491b0a59": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - 
"align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "06d6f26f29494f45abcd43b287163314": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": 
null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "143463ba065a4b96a6adbe97880b6330": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_17149f4044374e30b0926ee8683a08b6", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Warp State\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Warp Cycles Per Issued Instruction | cycle | 49.09 |\n| Warp Cycles Per Executed Instruction | cycle | 49.09 |\n| Avg. Active Threads Per Warp | | 32 |\n| Avg. Not Predicated Off Threads Per Warp | | 31.89 |\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 45.3 cycles being stalled waiting for a scoreboard dependency on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to shared memory. This stall type represents about 92.3% of the total average of 49.1 cycles between issuing two instructions.\n*Estimated Speedup (global): 12.96%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. 
The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "144d071405684ec3bb0a8259ae566518": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "16b4cfe3f86041eea5ee6471175ddce6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": 
null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "17149f4044374e30b0926ee8683a08b6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"184b041944b040c1a715b4b892fe3405": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2566b43ad6544ff5b3535e1bcf848394": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_81ee7055ea504002a61d86e34b9d2564", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Instruction\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Avg. 
Executed Instructions Per Scheduler | inst | 1,610,547.20 |\n| Executed Instructions | inst | 257,687,552 |\n| Avg. Issued Instructions Per Scheduler | inst | 1,610,606.20 |\n| Issued Instructions | inst | 257,696,992 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "2be329446de8406aa008cba59ea2cfc0": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_40e55caada4b4d4280e5d57bbf41daf7", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Launch\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Size | | 256 |\n| Function Cache Configuration | | CachePreferNone |\n| Grid Size | | 16,384 |\n| Registers Per Thread | register/thread | 32 |\n| Shared Memory Configuration Size | Kbyte | 32.77 |\n| Driver Shared Memory Per Block | byte/block | 0 |\n| Dynamic Shared Memory Per Block | byte/block | 0 |\n| Static Shared Memory Per Block | byte/block | 0 |\n| # SMs | SM | 40 |\n| Threads | thread | 4,194,304 |\n| Uses Green Context | | 0 |\n| Waves Per SM | | 102.40 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "3b435d6a3d0148bc82ac4c72334d03ab": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": 
null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3bdb7a5b68b0484ebfa4584fe4440cef": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - 
"visibility": null, - "width": null - } - }, - "3d25f6fa3bd447d38ef0619d34129054": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3fb5720992194190a644475c35bb3962": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": 
null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "40c9bccee18f489b8a60f0bcc66b380e": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_4812fe63767e41cc9cf82c9de944b383", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Scheduler\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| One or More Eligible | % | 16.02 |\n| Issued Warp Per Scheduler | | 0.16 |\n| No Eligible | % | 83.98 |\n| Active Warps Per Scheduler | warp | 7.87 |\n| Eligible Warps Per Scheduler | warp | 0.27 |\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 6.2 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.87 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. 
Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 12.96%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "40e55caada4b4d4280e5d57bbf41daf7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4812fe63767e41cc9cf82c9de944b383": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "520d86e9e9e142d2a9404675f4acab09": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": 
null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5862752de4db4eafa9c1894aa8c85385": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5bc24b339fc8419da15158a611ccc7c0": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_3d25f6fa3bd447d38ef0619d34129054", - "msg_id": "", - "outputs": [ - { - "data": { - 
"text/markdown": "## Speed Of Light\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| DRAM Frequency | Ghz | 5.00 |\n| SM Frequency | Mhz | 585.00 |\n| Elapsed Cycles | cycle | 201,083,989 |\n| Memory Throughput | % | 61.70 |\n| DRAM Throughput | % | 61.70 |\n| Duration | ms | 343.73 |\n| L1/TEX Cache Throughput | % | 17.89 |\n| L2 Cache Throughput | % | 8.42 |\n| SM Active Cycles | cycle | 201,130,599.78 |\n| Compute (SM) Throughput | % | 1.28 |\n\n🔧 **OPTIMIZATION**: Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the DRAM bottleneck. Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or whether there are values you can (re)compute.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. 
See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "678ec7c648a94af9a6d16e4bc22d743c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6be7ca5f9141465cb4b5126001b6c298": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DropdownModel", - "_options_labels": [ - "copy_optimized" - ], - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "DropdownView", - "description": 
"Kernel:", - "description_tooltip": null, - "disabled": false, - "index": 0, - "layout": "IPY_MODEL_731f455485da4a31a8455bc7e3cdf1a8", - "style": "IPY_MODEL_adfba531343149f7a215ec7ec30bf8e1" - } - }, - "7190fa5e9c1d4db5b53a19a00a43e29b": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_e2f9cbd9e16f49b7a84dc9bfe8808107", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Occupancy\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Limit SM | block | 16 |\n| Block Limit Registers | block | 8 |\n| Block Limit Shared Mem | block | 16 |\n| Block Limit Warps | block | 4 |\n| Theoretical Active Warps per SM | warp | 32 |\n| Theoretical Occupancy | % | 100 |\n| Achieved Occupancy | % | 97.54 |\n| Achieved Active Warps Per SM | warp | 31.21 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "731f455485da4a31a8455bc7e3cdf1a8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - 
"grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "400px" - } - }, - "7bd502fafaad4a1d9d5c7db9c1d54b9e": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_16b4cfe3f86041eea5ee6471175ddce6", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "# copy_optimized", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7dd0302a2cb475388f8fb2c5cf92f5d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "7ed48d6abaab4e95a347fa5c7b76bfac": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": 
null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "81ee7055ea504002a61d86e34b9d2564": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - 
"visibility": null, - "width": null - } - }, - "8234d3926a0c413a996b1965cf862551": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "84fd4b1277464e378c55ceb8b0628045": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_8fa15d63997f485795c9541d5151a876", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Launch\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Size | | 256 |\n| Function Cache 
Configuration | | CachePreferNone |\n| Grid Size | | 16,384 |\n| Registers Per Thread | register/thread | 35 |\n| Shared Memory Configuration Size | Kbyte | 32.77 |\n| Driver Shared Memory Per Block | byte/block | 0 |\n| Dynamic Shared Memory Per Block | byte/block | 0 |\n| Static Shared Memory Per Block | byte/block | 0 |\n| # SMs | SM | 40 |\n| Threads | thread | 4,194,304 |\n| Uses Green Context | | 0 |\n| Waves Per SM | | 102.40 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "8713dc92860b464aa1ce100891e33ed7": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_d74423ab0b3d416187826deaba0a57ba", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Summary\n\n### Speed Of Light\n\nℹ️ **INFO**: The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To further improve performance, work will likely need to be shifted from the most utilized to another unit. Start by analyzing DRAM in the Memory Workload Analysis section.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n\n### Memory Workload\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. 
Collecting it as an additional metric could enable the rule to provide more guidance.\n\n### Compute Workload\n\nℹ️ **INFO**: ALU is the highest-utilized pipeline (23.8%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. It is well-utilized, but should not be a bottleneck.\n\n### Scheduler\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 6.2 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.87 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 12.96%*\n\n### Warp State\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 45.3 cycles being stalled waiting for a scoreboard dependency on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to shared memory. 
This stall type represents about 92.3% of the total average of 49.1 cycles between issuing two instructions.\n*Estimated Speedup (global): 12.96%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "88670f4cd80d4823962f681c5db0b05c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "400px" - } - }, - "8bd862cae4a34d9eb8c6e0cbaba458fd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - 
"_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8d0b84554fc14eb8aa1d73db53e7446c": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_8234d3926a0c413a996b1965cf862551", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## PM Sampling\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Maximum Buffer Size | Mbyte | 3.47 |\n| Dropped Samples | sample | 0 |\n| Maximum Sampling Interval | cycle | 640,000 |\n| # Pass Groups | | 1 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "8fa15d63997f485795c9541d5151a876": { - 
"model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "918774a98d094a6b851bbc1df702ca4d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - 
"grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a3c5590a108e4630ac47a029112fb8da": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_e2c606a3ec4b41bc85095363bed2a8bb", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute & Memory Distribution\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Average DRAM Active Cycles | cycle | 1,059,639,779 |\n| Total DRAM Elapsed Cycles | cycle | 13,739,264,000 |\n| Average L1 Active Cycles | cycle | 201,130,599.78 |\n| Total L1 Elapsed Cycles | cycle | 8,048,353,288 |\n| Average L2 Active Cycles | cycle | 291,798,067.81 |\n| Total L2 Elapsed Cycles | cycle | 9,404,542,720 |\n| Average SM Active Cycles | cycle | 201,130,599.78 |\n| Total SM Elapsed Cycles | cycle | 8,048,353,288 |\n| Average SMSP Active Cycles | cycle | 201,107,006.62 |\n| Total SMSP Elapsed Cycles | cycle | 32,193,413,152 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "a7948ce618014b68a77c29a97b5f8089": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_918774a98d094a6b851bbc1df702ca4d", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## PM Sampling\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Maximum Buffer Size | Mbyte | 3.28 |\n| Dropped Samples | sample | 0 |\n| Maximum Sampling Interval | cycle | 160,000 |\n| # Pass Groups | | 1 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "a9bbc802cafc48aca6c853e0ebd6806e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ad7d9090fefd443fb56b97437c207ee8": { - 
"model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_3bdb7a5b68b0484ebfa4584fe4440cef", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Executed Ipc Active | inst/cycle | 0.64 |\n| Executed Ipc Elapsed | inst/cycle | 0.64 |\n| Issue Slots Busy | % | 16.02 |\n| Issued Ipc Active | inst/cycle | 0.64 |\n| SM Busy | % | 23.83 |\n\nℹ️ **INFO**: ALU is the highest-utilized pipeline (23.8%) based on active cycles, taking into account the rates of its different instructions. It executes integer and logic operations. 
It is well-utilized, but should not be a bottleneck.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "adfba531343149f7a215ec7ec30bf8e1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "initial" - } - }, - "b298377cb334442fb3be8dd55cfe8989": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_7ed48d6abaab4e95a347fa5c7b76bfac", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Source Counters\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Branch Instructions Ratio | % | 0.01 |\n| Branch Instructions | inst | 3,014,656 |\n| Branch Efficiency | % | 100 |\n| Avg. 
Divergent Branches | | 0 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "b457bbb2890e4f6ea0d007d3d05d555b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b47fd7b6a0154f21bc182a368783f018": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DropdownModel", - "_options_labels": [ - "copy_blocked" - ], - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "DropdownView", - "description": "Kernel:", - "description_tooltip": null, - "disabled": false, - "index": 0, - "layout": 
"IPY_MODEL_88670f4cd80d4823962f681c5db0b05c", - "style": "IPY_MODEL_efbee61b66c74e939e6ba9eb177b5104" - } - }, - "bca08631512947a9bb7678cae262caa1": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_678ec7c648a94af9a6d16e4bc22d743c", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Source Counters\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Branch Instructions Ratio | % | 0.01 |\n| Branch Instructions | inst | 2,490,368 |\n| Branch Efficiency | % | 100 |\n| Avg. Divergent Branches | | 0 |\n\n🔧 **OPTIMIZATION**: This kernel has uncoalesced global accesses resulting in a total of 402653184 excessive sectors (75% of the total 536870912 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. 
The CUDA Programming Guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional information on reducing uncoalesced device memory accesses.\n*Estimated Speedup (global): 74.47%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "bd3d14f42d7c4e058cf86091a34e0495": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "TabModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "TabModel", - "_titles": { - "0": "Summary", - "1": "Speed Of Light", - "2": "Memory Workload", - "3": "Compute Workload", - "4": "Compute & Memory Distribution", - "5": "Scheduler", - "6": "Warp State", - "7": "Instruction", - "8": "Launch", - "9": "PM Sampling", - "10": "Occupancy", - "11": "Source Counters" - }, - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "TabView", - "box_style": "", - "children": [ - "IPY_MODEL_fd81c004921e40acb4a5c89ce6b59345", - "IPY_MODEL_5bc24b339fc8419da15158a611ccc7c0", - "IPY_MODEL_e0520258747c44c3b3c03df575d08958", - "IPY_MODEL_f02b7d8187324ecda1befc6000394fb8", - "IPY_MODEL_a3c5590a108e4630ac47a029112fb8da", - "IPY_MODEL_f8282e5d62ba4fdfbb055930aae07bb0", - "IPY_MODEL_ddfcf54f17cb48b18c316d6fbc3f2df2", - "IPY_MODEL_2566b43ad6544ff5b3535e1bcf848394", - "IPY_MODEL_2be329446de8406aa008cba59ea2cfc0", - "IPY_MODEL_8d0b84554fc14eb8aa1d73db53e7446c", - "IPY_MODEL_7190fa5e9c1d4db5b53a19a00a43e29b", - "IPY_MODEL_bca08631512947a9bb7678cae262caa1" - ], - "layout": "IPY_MODEL_520d86e9e9e142d2a9404675f4acab09", - "selected_index": 0 - } - }, - "c0ac7614bdb74083a784cad1babfd7f4": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - 
"_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_0495f43209454c7dade96e09491b0a59", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Memory Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Memory Throughput | Gbyte/s | 278.16 |\n| Mem Busy | % | 27.59 |\n| Max Bandwidth | % | 87.04 |\n| L1/TEX Hit Rate | % | 0 |\n| L2 Hit Rate | % | 50.00 |\n| Mem Pipes Busy | % | 8.13 |\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an additional metric could enable the rule to provide more guidance.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "c8a882d190254c939506955708d0a0aa": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - 
"overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "cfffbaee02274c83ac30daeac3c63f6e": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_b457bbb2890e4f6ea0d007d3d05d555b", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Occupancy\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Limit SM | block | 16 |\n| Block Limit Registers | block | 6 |\n| Block Limit Shared Mem | block | 16 |\n| Block Limit Warps | block | 4 |\n| Theoretical Active Warps per SM | warp | 32 |\n| Theoretical Occupancy | % | 100 |\n| Achieved Occupancy | % | 98.34 |\n| Achieved Active Warps Per SM | warp | 31.47 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "d74423ab0b3d416187826deaba0a57ba": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": 
null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d7dd0302a2cb475388f8fb2c5cf92f5d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "TabModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "TabModel", - "_titles": { - "0": "Summary", - "1": "Speed Of Light", - "2": "Memory Workload", - "3": "Compute Workload", - "4": "Compute & Memory Distribution", - "5": "Scheduler", - "6": "Warp State", - "7": "Instruction", - "8": "Launch", - "9": "PM Sampling", - "10": "Occupancy", - "11": "Source Counters" - }, - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "TabView", - "box_style": "", - "children": [ - "IPY_MODEL_8713dc92860b464aa1ce100891e33ed7", - "IPY_MODEL_e3027ae6d73b4bca9df34d8bd61155c3", - "IPY_MODEL_c0ac7614bdb74083a784cad1babfd7f4", - "IPY_MODEL_ad7d9090fefd443fb56b97437c207ee8", - "IPY_MODEL_f3b8695ddc6a445ebc82dde76f8d1f79", - "IPY_MODEL_40c9bccee18f489b8a60f0bcc66b380e", - "IPY_MODEL_143463ba065a4b96a6adbe97880b6330", - "IPY_MODEL_fbcda75ba8764222b2acf028e8478b78", - "IPY_MODEL_84fd4b1277464e378c55ceb8b0628045", - "IPY_MODEL_a7948ce618014b68a77c29a97b5f8089", - "IPY_MODEL_cfffbaee02274c83ac30daeac3c63f6e", - "IPY_MODEL_b298377cb334442fb3be8dd55cfe8989" - ], - "layout": "IPY_MODEL_a9bbc802cafc48aca6c853e0ebd6806e", - "selected_index": 0 - } - }, - "da5ad97524f4499cb466a461c9e9a014": { - "model_module": 
"@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_8bd862cae4a34d9eb8c6e0cbaba458fd", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "# copy_blocked", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bd3d14f42d7c4e058cf86091a34e0495", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "dbe2bf8c7add48d8bf6e1f13b603fd38": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - 
"object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ddfcf54f17cb48b18c316d6fbc3f2df2": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_184b041944b040c1a715b4b892fe3405", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Warp State\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Warp Cycles Per Issued Instruction | cycle | 974.92 |\n| Warp Cycles Per Executed Instruction | cycle | 974.96 |\n| Avg. Active Threads Per Warp | | 32 |\n| Avg. Not Predicated Off Threads Per Warp | | 31.95 |\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 503.1 cycles being stalled waiting for a scoreboard dependency on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to shared memory. This stall type represents about 51.6% of the total average of 974.9 cycles between issuing two instructions.\n*Estimated Speedup (global): 38.3%*\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 463.7 cycles being stalled waiting for the L1 instruction queue for local and global (LG) memory operations to be not full. 
Typically, this stall occurs only when executing local or global memory instructions extremely frequently. Avoid redundant global memory accesses. Try to avoid using thread-local memory by checking if dynamically indexed arrays are declared in local scope, of if the kernel has excessive register pressure causing by spills. If applicable, consider combining multiple lower-width memory operations into fewer wider memory operations and try interleaving memory operations and math instructions. This stall type represents about 47.6% of the total average of 974.9 cycles between issuing two instructions.\n*Estimated Speedup (global): 38.3%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "e0520258747c44c3b3c03df575d08958": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_06d6f26f29494f45abcd43b287163314", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Memory Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Memory Throughput | Gbyte/s | 197.30 |\n| Mem Busy | % | 6.75 |\n| Max Bandwidth | % | 61.70 |\n| L1/TEX Hit Rate | % | 0.05 |\n| L2 Hit Rate | % | 2.36 |\n| Mem Pipes Busy | % | 1.28 |\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. 
Collecting it as an additional metric could enable the rule to provide more guidance.\n\n🔧 **OPTIMIZATION**: The memory access pattern for global loads from DRAM might not be optimal. On average, only 8.0 of the 32 bytes transmitted per sector are utilized by each thread. This applies to the 99.6% of sectors missed in L2. This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced global loads.\n*Estimated Speedup (global): 46.07%*\n\n🔧 **OPTIMIZATION**: The memory access pattern for global stores to DRAM might not be optimal. On average, only 8.0 of the 32 bytes transmitted per sector are utilized by each thread. This applies to the 95.6% of sectors missed in L2. This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced global stores.\n*Estimated Speedup (global): 44.22%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "e2c606a3ec4b41bc85095363bed2a8bb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - 
"object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e2f9cbd9e16f49b7a84dc9bfe8808107": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e3027ae6d73b4bca9df34d8bd61155c3": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_dbe2bf8c7add48d8bf6e1f13b603fd38", - "msg_id": "", - "outputs": [ - { - 
"data": { - "text/markdown": "## Speed Of Light\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| DRAM Frequency | Ghz | 4.99 |\n| SM Frequency | Mhz | 585.00 |\n| Elapsed Cycles | cycle | 10,401,526 |\n| Memory Throughput | % | 87.04 |\n| DRAM Throughput | % | 87.04 |\n| Duration | ms | 17.78 |\n| L1/TEX Cache Throughput | % | 36.36 |\n| L2 Cache Throughput | % | 27.59 |\n| SM Active Cycles | cycle | 10,389,825.82 |\n| Compute (SM) Throughput | % | 23.80 |\n\nℹ️ **INFO**: The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To further improve performance, work will likely need to be shifted from the most utilized to another unit. Start by analyzing DRAM in the Memory Workload Analysis section.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. 
See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "efbee61b66c74e939e6ba9eb177b5104": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "initial" - } - }, - "f02b7d8187324ecda1befc6000394fb8": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_5862752de4db4eafa9c1894aa8c85385", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Executed Ipc Active | inst/cycle | 0.03 |\n| Executed Ipc Elapsed | inst/cycle | 0.03 |\n| Issue Slots Busy | % | 0.80 |\n| Issued Ipc Active | inst/cycle | 0.03 |\n| SM Busy | % | 1.11 |\n\n🔧 **OPTIMIZATION**: All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. 
Check the Launch Statistics and Scheduler Statistics sections for further details.\n*Estimated Speedup (local): 98.89%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "f3b8695ddc6a445ebc82dde76f8d1f79": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_c8a882d190254c939506955708d0a0aa", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute & Memory Distribution\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Average DRAM Active Cycles | cycle | 77,278,172 |\n| Total DRAM Elapsed Cycles | cycle | 710,265,856 |\n| Average L1 Active Cycles | cycle | 10,389,825.82 |\n| Total L1 Elapsed Cycles | cycle | 415,992,616 |\n| Average L2 Active Cycles | cycle | 15,185,970.41 |\n| Total L2 Elapsed Cycles | cycle | 486,470,688 |\n| Average SM Active Cycles | cycle | 10,389,825.82 |\n| Total SM Elapsed Cycles | cycle | 415,992,616 |\n| Average SMSP Active Cycles | cycle | 10,389,220.56 |\n| Total SMSP Elapsed Cycles | cycle | 1,663,970,464 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "f8282e5d62ba4fdfbb055930aae07bb0": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": 
"IPY_MODEL_3fb5720992194190a644475c35bb3962", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Scheduler\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| One or More Eligible | % | 0.80 |\n| Issued Warp Per Scheduler | | 0.01 |\n| No Eligible | % | 99.20 |\n| Active Warps Per Scheduler | warp | 7.81 |\n| Eligible Warps Per Scheduler | warp | 0.01 |\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 124.9 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.81 active warps per scheduler, but only an average of 0.01 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. 
Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 38.3%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "fbcda75ba8764222b2acf028e8478b78": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_144d071405684ec3bb0a8259ae566518", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Instruction\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Avg. Executed Instructions Per Scheduler | inst | 1,664,614.40 |\n| Executed Instructions | inst | 266,338,304 |\n| Avg. Issued Instructions Per Scheduler | inst | 1,664,650.29 |\n| Issued Instructions | inst | 266,344,046 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "fd81c004921e40acb4a5c89ce6b59345": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_3b435d6a3d0148bc82ac4c72334d03ab", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Summary\n\n### Speed Of Light\n\n🔧 **OPTIMIZATION**: Memory is more heavily utilized than Compute: Look at the Memory Workload Analysis section to identify the DRAM bottleneck. 
Check memory replay (coalescing) metrics to make sure you're efficiently utilizing the bytes transferred. Also consider whether it is possible to do more work per memory access (kernel fusion) or whether there are values you can (re)compute.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n\n### Memory Workload\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an additional metric could enable the rule to provide more guidance.\n\n🔧 **OPTIMIZATION**: The memory access pattern for global loads from DRAM might not be optimal. On average, only 8.0 of the 32 bytes transmitted per sector are utilized by each thread. This applies to the 99.6% of sectors missed in L2. This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced global loads.\n*Estimated Speedup (global): 46.07%*\n\n🔧 **OPTIMIZATION**: The memory access pattern for global stores to DRAM might not be optimal. On average, only 8.0 of the 32 bytes transmitted per sector are utilized by each thread. This applies to the 95.6% of sectors missed in L2. This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced global stores.\n*Estimated Speedup (global): 44.22%*\n\n### Compute Workload\n\n🔧 **OPTIMIZATION**: All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. 
Check the Launch Statistics and Scheduler Statistics sections for further details.\n*Estimated Speedup (local): 98.89%*\n\n### Scheduler\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 124.9 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.81 active warps per scheduler, but only an average of 0.01 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 38.3%*\n\n### Warp State\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 503.1 cycles being stalled waiting for a scoreboard dependency on a L1TEX (local, global, surface, texture) operation. Find the instruction producing the data being waited upon to identify the culprit. To reduce the number of cycles waiting on L1TEX data accesses verify the memory access patterns are optimal for the target architecture, attempt to increase cache hit rates by increasing data locality (coalescing), or by changing the cache configuration. Consider moving frequently used data to shared memory. This stall type represents about 51.6% of the total average of 974.9 cycles between issuing two instructions.\n*Estimated Speedup (global): 38.3%*\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 463.7 cycles being stalled waiting for the L1 instruction queue for local and global (LG) memory operations to be not full. 
Typically, this stall occurs only when executing local or global memory instructions extremely frequently. Avoid redundant global memory accesses. Try to avoid using thread-local memory by checking if dynamically indexed arrays are declared in local scope, of if the kernel has excessive register pressure causing by spills. If applicable, consider combining multiple lower-width memory operations into fewer wider memory operations and try interleaving memory operations and math instructions. This stall type represents about 47.6% of the total average of 974.9 cycles between issuing two instructions.\n*Estimated Speedup (global): 38.3%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n\n### Source Counters\n\n🔧 **OPTIMIZATION**: This kernel has uncoalesced global accesses resulting in a total of 402653184 excessive sectors (75% of the total 536870912 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. 
The CUDA Programming Guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional information on reducing uncoalesced device memory accesses.\n*Estimated Speedup (global): 74.47%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - } - } + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb index 478f4768..e16f9efc 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb @@ -822,10 +822,12 @@ "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" }, "language_info": { @@ -838,2629 +840,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "05bad554df84498daf462d395e980f52": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_f1930ef855f844ee947a1c9b07ed95a4", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## WorkloadDistribution\n\n🔧 **OPTIMIZATION**: One or more L2 Slices have a 
much higher number of active cycles than the average number of active cycles. Maximum instance value is 89.83% above the average, while the minimum instance value is 60.99% below the average.\n*Estimated Speedup (global): 9.132%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "061f8098ad9a430aad63a106b4629fb5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "400px" - } - }, - "0952fc28a1244c46befc2bed2cd7cb4c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": 
null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0d706bed98bf41eeb56205638c01f1ec": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_e9a4e840c12041809843a134cc29d791", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Speed Of Light\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| DRAM Frequency | Ghz | 5.00 |\n| SM Frequency | Mhz | 584.99 |\n| Elapsed Cycles | cycle | 4,835,673 |\n| Memory Throughput | % | 6.60 |\n| DRAM Throughput | % | 0.79 |\n| Duration | ms | 8.27 |\n| L1/TEX Cache Throughput | % | 13.20 |\n| L2 Cache Throughput | % | 5.65 |\n| SM Active Cycles | cycle | 4,808,396.12 |\n| Compute (SM) Throughput | % | 1.44 |\n\n🔧 **OPTIMIZATION**: This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance of this device. 
Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "0dcbea49322640a28c4f3459ea50c204": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_cb604331c58f43bda3a0135cc761ef22", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute & Memory Distribution\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Average DRAM Active Cycles | cycle | 417,585 |\n| Total DRAM Elapsed Cycles | cycle | 11,925,504 |\n| Average L1 Active Cycles | cycle | 158,097.65 |\n| Total L1 Elapsed Cycles | cycle | 6,961,192 |\n| Average L2 Active Cycles | cycle | 184,213.25 |\n| Total L2 Elapsed Cycles | cycle | 8,208,544 |\n| Average SM Active Cycles | cycle | 158,097.65 |\n| Total SM Elapsed Cycles | cycle | 6,961,192 |\n| Average SMSP Active Cycles | cycle | 158,157.98 |\n| Total SMSP Elapsed Cycles | cycle | 27,844,768 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "0f94f91896304b3c861750946febeccc": { - "model_module": "@jupyter-widgets/base", - 
"model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1006aafeb85f4ae9b1cabe28f33184f4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": 
null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "15e004fe6f924bf5887331f0ad9da19f": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_63f8786f07e74f4caef3ebf115110daa", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Warp State\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Warp Cycles Per Issued Instruction | cycle | 2,467.51 |\n| Warp Cycles Per Executed Instruction | cycle | 2,473.27 |\n| Avg. Active Threads Per Warp | | 32 |\n| Avg. Not Predicated Off Threads Per Warp | | 32 |\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 1886.3 cycles being stalled waiting for the L1 instruction queue for local and global (LG) memory operations to be not full. Typically, this stall occurs only when executing local or global memory instructions extremely frequently. Avoid redundant global memory accesses. Try to avoid using thread-local memory by checking if dynamically indexed arrays are declared in local scope, of if the kernel has excessive register pressure causing by spills. If applicable, consider combining multiple lower-width memory operations into fewer wider memory operations and try interleaving memory operations and math instructions. 
This stall type represents about 76.4% of the total average of 2467.5 cycles between issuing two instructions.\n*Estimated Speedup (global): 76.45%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "16d240c1461a444ea9cdbbd4a856e8f3": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_fd8251af0c4743e29b89a76a73a27af4", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## WorkloadDistribution\n\n🔧 **OPTIMIZATION**: One or more SMs have a much higher number of active cycles than the average number of active cycles. Maximum instance value is 8.44% above the average, while the minimum instance value is 3.29% below the average.\n*Estimated Speedup (global): 7.666%*\n\n🔧 **OPTIMIZATION**: One or more SMSPs have a much higher number of active cycles than the average number of active cycles. Maximum instance value is 8.37% above the average, while the minimum instance value is 3.25% below the average.\n*Estimated Speedup (global): 7.609%*\n\n🔧 **OPTIMIZATION**: One or more L1 Slices have a much higher number of active cycles than the average number of active cycles. 
Maximum instance value is 8.44% above the average, while the minimum instance value is 3.29% below the average.\n*Estimated Speedup (global): 7.666%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "179ce686ac1f4e60b5d34951446ad599": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_83f0975a18e0401d859f001422aebc74", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute & Memory Distribution\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Average DRAM Active Cycles | cycle | 326,938.50 |\n| Total DRAM Elapsed Cycles | cycle | 330,405,888 |\n| Average L1 Active Cycles | cycle | 4,808,396.12 |\n| Total L1 Elapsed Cycles | cycle | 193,385,088 |\n| Average L2 Active Cycles | cycle | 718,498.38 |\n| Total L2 Elapsed Cycles | cycle | 226,160,704 |\n| Average SM Active Cycles | cycle | 4,808,396.12 |\n| Total SM Elapsed Cycles | cycle | 193,385,088 |\n| Average SMSP Active Cycles | cycle | 4,807,310.28 |\n| Total SMSP Elapsed Cycles | cycle | 773,540,352 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "1edda20eae6441a99d31a1105f30b8c6": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": 
"IPY_MODEL_607214ca07174871b459e301eb7ad699", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Source Counters\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Branch Instructions Ratio | % | 0.03 |\n| Branch Instructions | inst | 58,624 |\n| Branch Efficiency | % | 0 |\n| Avg. Divergent Branches | | 0 |\n\n🔧 **OPTIMIZATION**: This kernel has uncoalesced global accesses resulting in a total of 4988724 excessive sectors (68% of the total 7329833 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. The CUDA Programming Guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional information on reducing uncoalesced device memory accesses.\n*Estimated Speedup (global): 6.919%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "24c64a3e43f44f1bb65cee915d1aa9ce": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - 
"object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "277a7612626b44a0b009d28fe486ab3d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "400px" - } - }, - "29f54431514146cea6e08d352058ccfd": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_a48f630bb3584ce294f2e79d60c4b40e", - "msg_id": "", - "outputs": [ - { - "data": { - 
"text/markdown": "# histogram_localized", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a122e99121234ce396fa20aa6e462574", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "2a50582b630a4e63a41f10adb53303e2": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_b6bfff789d354bdd8c630826eaae63b6", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Memory Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Memory Throughput | Gbyte/s | 2.53 |\n| Mem Busy | % | 4.12 |\n| Max Bandwidth | % | 6.60 |\n| L1/TEX Hit Rate | % | 26.05 |\n| L2 Hit Rate | % | 94.95 |\n| Mem Pipes Busy | % | 1.44 |\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an additional metric could enable the rule to provide more guidance.\n\n🔧 **OPTIMIZATION**: The memory access pattern for global loads from L1TEX might not be optimal. On average, only 4.0 of the 32 bytes transmitted per sector are utilized by each thread. This could possibly be caused by a stride between threads. 
Check the Source Counters section for uncoalesced global loads.\n*Estimated Speedup (global): 11.55%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "2a64bde12026429db15794fad73e7759": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_bb74cb5d3eb042e0b148a8554236a950", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Memory Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Memory Throughput | Gbyte/s | 89.06 |\n| Mem Busy | % | 45.20 |\n| Max Bandwidth | % | 28.21 |\n| L1/TEX Hit Rate | % | 0 |\n| L2 Hit Rate | % | 50.38 |\n| Mem Pipes Busy | % | 28.21 |\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. 
Collecting it as an additional metric could enable the rule to provide more guidance.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "2bf258c6b7734068a20ed1fe5bcacf61": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_f2ef2b3576a54144a9faadd4ea42f0f4", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Scheduler\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| One or More Eligible | % | 7.55 |\n| Issued Warp Per Scheduler | | 0.08 |\n| No Eligible | % | 92.45 |\n| Active Warps Per Scheduler | warp | 7.66 |\n| Eligible Warps Per Scheduler | warp | 0.27 |\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 13.2 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.66 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. 
Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 54.8%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "2ebcfbe930ca4061a86c8f1ea87b8eb0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "39e58d26a4a84b99ad768f93a7a3a8a5": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_f440961da49a4e6c9185ae5f55373df3", - "msg_id": "", - "outputs": [ - { 
- "data": { - "text/markdown": "## Occupancy\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Limit SM | block | 16 |\n| Block Limit Registers | block | 5 |\n| Block Limit Shared Mem | block | 16 |\n| Block Limit Warps | block | 2 |\n| Theoretical Active Warps per SM | warp | 32 |\n| Theoretical Occupancy | % | 100 |\n| Achieved Occupancy | % | 91.83 |\n| Achieved Active Warps Per SM | warp | 29.39 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "4d9753b4b11f425d9476d4f69932988c": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_24c64a3e43f44f1bb65cee915d1aa9ce", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Scheduler\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| One or More Eligible | % | 0.30 |\n| Issued Warp Per Scheduler | | 0.00 |\n| No Eligible | % | 99.70 |\n| Active Warps Per Scheduler | warp | 7.35 |\n| Eligible Warps Per Scheduler | warp | 0.01 |\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 335.6 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.35 active warps per scheduler, but only an average of 0.01 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. 
Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 93.4%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "50cc8f97b44549faa0f202361f5f7e6c": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_0f94f91896304b3c861750946febeccc", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Launch\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Size | | 512 |\n| Function Cache Configuration | | CachePreferNone |\n| Grid Size | | 3,664 |\n| Registers Per Thread | register/thread | 18 |\n| Shared Memory Configuration Size | Kbyte | 32.77 |\n| Driver Shared Memory Per Block | byte/block | 0 |\n| Dynamic Shared Memory Per Block | byte/block | 0 |\n| Static Shared Memory Per Block | byte/block | 0 |\n| # SMs | SM | 40 |\n| Threads | thread | 1,875,968 |\n| Uses Green Context | | 0 |\n| Waves Per SM | | 45.80 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "5e77669944a346e6b628004967e7ba75": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "initial" - } - }, - "607214ca07174871b459e301eb7ad699": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "63f8786f07e74f4caef3ebf115110daa": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "64037d07a15d4e238471eeccce3d0ead": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6439cea566ff46e08f08022e1c0d40d6": { - "model_module": 
"@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_64037d07a15d4e238471eeccce3d0ead", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## PM Sampling\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Maximum Buffer Size | Mbyte | 1.05 |\n| Dropped Samples | sample | 0 |\n| Maximum Sampling Interval | cycle | 20,000 |\n| # Pass Groups | | 1 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "650c0471ffea4b4bad75444a904978b1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - 
"padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6f476f6ca14a45699799d56dedbfc4a9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "initial" - } - }, - "72eba4c5f11b49fb89df581268cc1ddc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DropdownModel", - "_options_labels": [ - "histogram_localized" - ], - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "DropdownView", - "description": "Kernel:", - "description_tooltip": null, - "disabled": false, - "index": 0, - "layout": "IPY_MODEL_277a7612626b44a0b009d28fe486ab3d", - "style": "IPY_MODEL_6f476f6ca14a45699799d56dedbfc4a9" - } - }, - "75a37b0df227406d87849b8ccc0e8cc8": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_9f7ceb76de8e4d9c8a6d03403fc87038", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Executed Ipc Active | inst/cycle | 0.30 
|\n| Executed Ipc Elapsed | inst/cycle | 0.27 |\n| Issue Slots Busy | % | 7.55 |\n| Issued Ipc Active | inst/cycle | 0.30 |\n| SM Busy | % | 10.97 |\n\n🔧 **OPTIMIZATION**: All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.\n*Estimated Speedup (local): 96.64%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "77b1acef7c294c85a13b408d4763b865": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_9c1ff914f65345ddba6465fc350ffa77", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Instruction\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Avg. Executed Instructions Per Scheduler | inst | 11,908 |\n| Executed Instructions | inst | 1,905,280 |\n| Avg. 
Issued Instructions Per Scheduler | inst | 11,943.96 |\n| Issued Instructions | inst | 1,911,034 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "804c25036085410fbbcba82f19667d1d": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_face58a5cc384e4cb2b75921d967b2c5", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "# histogram_global", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "899b07bbb6304e2a9143dece489f7775", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": "Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "83f0975a18e0401d859f001422aebc74": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - 
"grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "899b07bbb6304e2a9143dece489f7775": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "TabModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "TabModel", - "_titles": { - "0": "Summary", - "1": "Speed Of Light", - "2": "Memory Workload", - "3": "Compute Workload", - "4": "Compute & Memory Distribution", - "5": "Scheduler", - "6": "Warp State", - "7": "Instruction", - "8": "Launch", - "9": "PM Sampling", - "10": "Occupancy", - "11": "Source Counters", - "12": "WorkloadDistribution" - }, - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "TabView", - "box_style": "", - "children": [ - "IPY_MODEL_b881cf3221d54ac4a68c38792d53bdf5", - "IPY_MODEL_0d706bed98bf41eeb56205638c01f1ec", - "IPY_MODEL_2a50582b630a4e63a41f10adb53303e2", - "IPY_MODEL_dbf4eef8f47e4789a62977505c049825", - "IPY_MODEL_179ce686ac1f4e60b5d34951446ad599", - "IPY_MODEL_4d9753b4b11f425d9476d4f69932988c", - "IPY_MODEL_15e004fe6f924bf5887331f0ad9da19f", - "IPY_MODEL_f02f5a0cfd354371b95987c862f10637", - "IPY_MODEL_50cc8f97b44549faa0f202361f5f7e6c", - "IPY_MODEL_95e56a0ec4484f06aea59796d88e0464", - "IPY_MODEL_39e58d26a4a84b99ad768f93a7a3a8a5", - "IPY_MODEL_1edda20eae6441a99d31a1105f30b8c6", - "IPY_MODEL_05bad554df84498daf462d395e980f52" - ], - "layout": "IPY_MODEL_af61114771e442e2bb8bc033e882dbe2", - "selected_index": 0 - } - }, - 
"8d3c55d6d03b4d79aba3d5e7b1a73f2e": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_da059452a8744262a3ebfec6543107bf", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Speed Of Light\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| DRAM Frequency | Ghz | 4.97 |\n| SM Frequency | Mhz | 584.87 |\n| Elapsed Cycles | cycle | 175,527 |\n| Memory Throughput | % | 45.20 |\n| DRAM Throughput | % | 28.01 |\n| Duration | us | 300.10 |\n| L1/TEX Cache Throughput | % | 90.41 |\n| L2 Cache Throughput | % | 6.09 |\n| SM Active Cycles | cycle | 158,097.65 |\n| Compute (SM) Throughput | % | 28.21 |\n\n🔧 **OPTIMIZATION**: This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. 
See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "95e56a0ec4484f06aea59796d88e0464": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_f47dbfc47f8647a38c370a2b7be404b2", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## PM Sampling\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Maximum Buffer Size | Mbyte | 2.10 |\n| Dropped Samples | sample | 0 |\n| Maximum Sampling Interval | cycle | 20,000 |\n| # Pass Groups | | 1 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "9aee114e6f204e139cad4e9797cd9cc5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - 
"justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9c1ff914f65345ddba6465fc350ffa77": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9f7ceb76de8e4d9c8a6d03403fc87038": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a122e99121234ce396fa20aa6e462574": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "TabModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "TabModel", - "_titles": { - "0": "Summary", - "1": "Speed Of Light", - "2": "Memory Workload", - "3": "Compute Workload", - "4": "Compute & Memory Distribution", - "5": "Scheduler", - "6": "Warp State", - "7": "Instruction", - "8": "Launch", - "9": "PM Sampling", - "10": "Occupancy", - "11": "Source Counters", - "12": "WorkloadDistribution" - }, - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "TabView", - "box_style": "", - "children": [ - "IPY_MODEL_ea2f4b91760f475ebc10da01166b63b8", - "IPY_MODEL_8d3c55d6d03b4d79aba3d5e7b1a73f2e", - "IPY_MODEL_2a64bde12026429db15794fad73e7759", - "IPY_MODEL_75a37b0df227406d87849b8ccc0e8cc8", - "IPY_MODEL_0dcbea49322640a28c4f3459ea50c204", - "IPY_MODEL_2bf258c6b7734068a20ed1fe5bcacf61", - 
"IPY_MODEL_a130d437cec947d3ad49ab36fcfe1ef6", - "IPY_MODEL_77b1acef7c294c85a13b408d4763b865", - "IPY_MODEL_b8afefd142c641dd8733d7695d9cf14a", - "IPY_MODEL_6439cea566ff46e08f08022e1c0d40d6", - "IPY_MODEL_c799b237ce024a29b2e23307a7ff9e67", - "IPY_MODEL_c92de1f9425a4f2f85bb07555c3d51a1", - "IPY_MODEL_16d240c1461a444ea9cdbbd4a856e8f3" - ], - "layout": "IPY_MODEL_c8a33b735706409e93422519ca88fd38", - "selected_index": 0 - } - }, - "a130d437cec947d3ad49ab36fcfe1ef6": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_9aee114e6f204e139cad4e9797cd9cc5", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Warp State\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Warp Cycles Per Issued Instruction | cycle | 101.46 |\n| Warp Cycles Per Executed Instruction | cycle | 101.76 |\n| Avg. Active Threads Per Warp | | 32 |\n| Avg. Not Predicated Off Threads Per Warp | | 31.75 |\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 66.8 cycles being stalled waiting for the MIO (memory input/output) instruction queue to be not full. This stall reason is high in cases of extreme utilization of the MIO pipelines, which include special math instructions, dynamic branches, as well as shared memory instructions. When caused by shared memory accesses, trying to use fewer but wider loads can reduce pipeline pressure. 
This stall type represents about 65.8% of the total average of 101.5 cycles between issuing two instructions.\n*Estimated Speedup (global): 54.8%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "a21053863aa949e484f30f47bcf5c045": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DropdownModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DropdownModel", - "_options_labels": [ - "histogram_global" - ], - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "DropdownView", - "description": "Kernel:", - "description_tooltip": null, - "disabled": false, - "index": 0, - "layout": "IPY_MODEL_061f8098ad9a430aad63a106b4629fb5", - "style": "IPY_MODEL_5e77669944a346e6b628004967e7ba75" - } - }, - "a48f630bb3584ce294f2e79d60c4b40e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": 
null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "af61114771e442e2bb8bc033e882dbe2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b6bfff789d354bdd8c630826eaae63b6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": 
"LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b881cf3221d54ac4a68c38792d53bdf5": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_ea81f1518f0d47b995f50df7e74f9329", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Summary\n\n### Speed Of Light\n\n🔧 **OPTIMIZATION**: This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate latency issues. 
Look at Scheduler Statistics and Warp State Statistics for potential reasons.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n\n### Memory Workload\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an additional metric could enable the rule to provide more guidance.\n\n🔧 **OPTIMIZATION**: The memory access pattern for global loads from L1TEX might not be optimal. On average, only 4.0 of the 32 bytes transmitted per sector are utilized by each thread. This could possibly be caused by a stride between threads. Check the Source Counters section for uncoalesced global loads.\n*Estimated Speedup (global): 11.55%*\n\n### Compute Workload\n\n🔧 **OPTIMIZATION**: All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.\n*Estimated Speedup (local): 99.85%*\n\n### Scheduler\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 335.6 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.35 active warps per scheduler, but only an average of 0.01 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. 
To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 93.4%*\n\n### Warp State\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 1886.3 cycles being stalled waiting for the L1 instruction queue for local and global (LG) memory operations to be not full. Typically, this stall occurs only when executing local or global memory instructions extremely frequently. Avoid redundant global memory accesses. Try to avoid using thread-local memory by checking if dynamically indexed arrays are declared in local scope, of if the kernel has excessive register pressure causing by spills. If applicable, consider combining multiple lower-width memory operations into fewer wider memory operations and try interleaving memory operations and math instructions. This stall type represents about 76.4% of the total average of 2467.5 cycles between issuing two instructions.\n*Estimated Speedup (global): 76.45%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n\n### Source Counters\n\n🔧 **OPTIMIZATION**: This kernel has uncoalesced global accesses resulting in a total of 4988724 excessive sectors (68% of the total 7329833 sectors). Check the L2 Theoretical Sectors Global Excessive table for the primary source locations. 
The CUDA Programming Guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses) has additional information on reducing uncoalesced device memory accesses.\n*Estimated Speedup (global): 6.919%*\n\n### WorkloadDistribution\n\n🔧 **OPTIMIZATION**: One or more L2 Slices have a much higher number of active cycles than the average number of active cycles. Maximum instance value is 89.83% above the average, while the minimum instance value is 60.99% below the average.\n*Estimated Speedup (global): 9.132%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "b8afefd142c641dd8733d7695d9cf14a": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_2ebcfbe930ca4061a86c8f1ea87b8eb0", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Launch\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Size | | 512 |\n| Function Cache Configuration | | CachePreferNone |\n| Grid Size | | 916 |\n| Registers Per Thread | register/thread | 47 |\n| Shared Memory Configuration Size | Kbyte | 32.77 |\n| Driver Shared Memory Per Block | byte/block | 0 |\n| Dynamic Shared Memory Per Block | byte/block | 0 |\n| Static Shared Memory Per Block | Kbyte/block | 1.02 |\n| # SMs | SM | 40 |\n| Threads | thread | 468,992 |\n| Uses Green Context | | 0 |\n| Waves Per SM | | 11.45 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "bb74cb5d3eb042e0b148a8554236a950": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - 
"model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c799b237ce024a29b2e23307a7ff9e67": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_f261d85898c54dc9a72eaf5bda2703ca", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Occupancy\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Block Limit SM | block | 16 |\n| Block Limit Registers | block | 2 |\n| Block Limit Shared Mem | block | 32 |\n| Block Limit Warps | block | 2 |\n| Theoretical Active Warps per SM | warp | 32 |\n| 
Theoretical Occupancy | % | 100 |\n| Achieved Occupancy | % | 95.92 |\n| Achieved Active Warps Per SM | warp | 30.69 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "c8a33b735706409e93422519ca88fd38": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c92de1f9425a4f2f85bb07555c3d51a1": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_d7db1b0b42b948ae8c3b97b059f68cdb", - "msg_id": "", - "outputs": [ - { - "data": { - 
"text/markdown": "## Source Counters\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Branch Instructions Ratio | % | 0.01 |\n| Branch Instructions | inst | 21,984 |\n| Branch Efficiency | % | 0 |\n| Avg. Divergent Branches | | 0 |\n\n🔧 **OPTIMIZATION**: This kernel has uncoalesced shared accesses resulting in a total of 2154026 excessive wavefronts (82% of the total 2637674 wavefronts). Check the L1 Wavefronts Shared Excessive table for the primary source locations. The CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#shared-memory-in-matrix-multiplication-c-ab) has an example on optimizing shared memory accesses.\n*Estimated Speedup (global): 74.19%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "cb604331c58f43bda3a0135cc761ef22": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - 
"padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d7db1b0b42b948ae8c3b97b059f68cdb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "da059452a8744262a3ebfec6543107bf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - 
"grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dbf4eef8f47e4789a62977505c049825": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_0952fc28a1244c46befc2bed2cd7cb4c", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Compute Workload\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Executed Ipc Active | inst/cycle | 0.01 |\n| Executed Ipc Elapsed | inst/cycle | 0.01 |\n| Issue Slots Busy | % | 0.30 |\n| Issued Ipc Active | inst/cycle | 0.01 |\n| SM Busy | % | 0.37 |\n\n🔧 **OPTIMIZATION**: All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. 
Check the Launch Statistics and Scheduler Statistics sections for further details.\n*Estimated Speedup (local): 99.85%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "e9a4e840c12041809843a134cc29d791": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ea2f4b91760f475ebc10da01166b63b8": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_650c0471ffea4b4bad75444a904978b1", - "msg_id": "", - "outputs": [ - { - "data": { - 
"text/markdown": "## Summary\n\n### Speed Of Light\n\n🔧 **OPTIMIZATION**: This kernel exhibits low compute throughput and memory bandwidth utilization relative to the peak performance of this device. Achieved compute throughput and/or memory bandwidth below 60.0% of peak typically indicate latency issues. Look at Scheduler Statistics and Warp State Statistics for potential reasons.\n\nℹ️ **INFO**: The ratio of peak float (fp32) to double (fp64) performance on this device is 32:1. The kernel achieved 0% of this device's fp32 peak performance and 0% of its fp64 peak performance. See the Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#roofline) for more details on roofline analysis.\n\n### Memory Workload\n\n⚠️ **WARNING**: The optional metric lts__average_gcomp_input_sector_success_rate.pct could not be found. Collecting it as an additional metric could enable the rule to provide more guidance.\n\n### Compute Workload\n\n🔧 **OPTIMIZATION**: All compute pipelines are under-utilized. Either this kernel is very small or it doesn't issue enough warps per scheduler. Check the Launch Statistics and Scheduler Statistics sections for further details.\n*Estimated Speedup (local): 96.64%*\n\n### Scheduler\n\n🔧 **OPTIMIZATION**: Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only issues an instruction every 13.2 cycles. This might leave hardware resources underutilized and may lead to less optimal performance. Out of the maximum of 8 warps per scheduler, this kernel allocates an average of 7.66 active warps per scheduler, but only an average of 0.27 warps were eligible per cycle. Eligible warps are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible warp results in no instruction being issued and the issue slot remains unused. 
To increase the number of eligible warps, avoid possible load imbalances due to highly different execution durations per warp. Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.\n*Estimated Speedup (local): 54.8%*\n\n### Warp State\n\n🔧 **OPTIMIZATION**: On average, each warp of this kernel spends 66.8 cycles being stalled waiting for the MIO (memory input/output) instruction queue to be not full. This stall reason is high in cases of extreme utilization of the MIO pipelines, which include special math instructions, dynamic branches, as well as shared memory instructions. When caused by shared memory accesses, trying to use fewer but wider loads can reduce pipeline pressure. This stall type represents about 65.8% of the total average of 101.5 cycles between issuing two instructions.\n*Estimated Speedup (global): 54.8%*\n\nℹ️ **INFO**: Check the Warp Stall Sampling (All Samples) table for the top stall locations in your source based on sampling data. The Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#metrics-reference) provides more details on each stall reason.\n\n### Source Counters\n\n🔧 **OPTIMIZATION**: This kernel has uncoalesced shared accesses resulting in a total of 2154026 excessive wavefronts (82% of the total 2637674 wavefronts). Check the L1 Wavefronts Shared Excessive table for the primary source locations. The CUDA Best Practices Guide (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#shared-memory-in-matrix-multiplication-c-ab) has an example on optimizing shared memory accesses.\n*Estimated Speedup (global): 74.19%*\n\n### WorkloadDistribution\n\n🔧 **OPTIMIZATION**: One or more SMs have a much higher number of active cycles than the average number of active cycles. 
Maximum instance value is 8.44% above the average, while the minimum instance value is 3.29% below the average.\n*Estimated Speedup (global): 7.666%*\n\n🔧 **OPTIMIZATION**: One or more SMSPs have a much higher number of active cycles than the average number of active cycles. Maximum instance value is 8.37% above the average, while the minimum instance value is 3.25% below the average.\n*Estimated Speedup (global): 7.609%*\n\n🔧 **OPTIMIZATION**: One or more L1 Slices have a much higher number of active cycles than the average number of active cycles. Maximum instance value is 8.44% above the average, while the minimum instance value is 3.29% below the average.\n*Estimated Speedup (global): 7.666%*\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "ea81f1518f0d47b995f50df7e74f9329": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - 
"visibility": null, - "width": null - } - }, - "f02f5a0cfd354371b95987c862f10637": { - "model_module": "@jupyter-widgets/output", - "model_module_version": "1.0.0", - "model_name": "OutputModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/output", - "_model_module_version": "1.0.0", - "_model_name": "OutputModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/output", - "_view_module_version": "1.0.0", - "_view_name": "OutputView", - "layout": "IPY_MODEL_1006aafeb85f4ae9b1cabe28f33184f4", - "msg_id": "", - "outputs": [ - { - "data": { - "text/markdown": "## Instruction\n\n| Metric Name | Metric Unit | Metric Value |\n|-------------|-------------|--------------|\n| Avg. Executed Instructions Per Scheduler | inst | 14,289.60 |\n| Executed Instructions | inst | 2,286,336 |\n| Avg. Issued Instructions Per Scheduler | inst | 14,322.95 |\n| Issued Instructions | inst | 2,291,672 |\n", - "text/plain": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ] - } - }, - "f1930ef855f844ee947a1c9b07ed95a4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - 
"min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f261d85898c54dc9a72eaf5bda2703ca": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f2ef2b3576a54144a9faadd4ea42f0f4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": 
null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f440961da49a4e6c9185ae5f55373df3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - 
}, - "f47dbfc47f8647a38c370a2b7be404b2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "face58a5cc384e4cb2b75921d967b2c5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - 
"grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fd8251af0c4743e29b89a76a73a27af4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - } + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/43__kernel_authoring__black_and_white__SOLUTION.ipynb 
b/tutorials/accelerated-python/notebooks/kernels/solutions/43__kernel_authoring__black_and_white__SOLUTION.ipynb index 3eabe764..5529e105 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/43__kernel_authoring__black_and_white__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/43__kernel_authoring__black_and_white__SOLUTION.ipynb @@ -211,7 +211,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb b/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb index 6a810ac3..8a8438d6 100644 --- a/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb @@ -1,655 +1,673 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DataFrames with Pandas and cuDF" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Table of Contents\n", - "\n", - "1. [Introduction](#1.-Introduction)\n", - "2. [Pandas Essentials](#2.-Pandas-Essentials)\n", - " - [2.1 Series and DataFrame Objects](#2.1-Series-and-DataFrame-Objects)\n", - " - [2.2 Selecting and Filtering Data](#2.2-Selecting-and-Filtering-Data)\n", - " - [2.3 Sorting](#2.3-Sorting)\n", - " - [2.4 Summarizing Data](#2.4-Summarizing-Data)\n", - " - [2.5 Grouped Aggregations (groupby)](#2.5-Grouped-Aggregations-(groupby))\n", - " - [2.6 String Operations](#2.6-String-Operations)\n", - " - [2.7 Time Series](#2.7-Time-Series)\n", - " - [2.8 User-Defined Operations (apply)](#2.8-User-Defined-Operations-(apply))\n", - "3. 
[Enter cuDF: GPU DataFrames](#3.-Enter-cuDF:-GPU-DataFrames)\n", - " - [3.1 Exercise: Date Formatting Failure](#3.1-Exercise:-Date-Formatting-Failure)\n", - " - [3.2 Exercise: Why `.apply()` Breaks Down in cuDF](#3.2-Exercise:-Why-`.apply()`-Breaks-Down-in-cuDF)\n", - "4. [Exercise: Analyzing Real Data (NYC Parking Violations)](#4.-Exercise:-Analyzing-Real-Data-(NYC-Parking-Violations))\n", - " - [Step 0: Download Data](#Step-0:-Download-Data)\n", - " - [Task 1: Data Inspection (Pandas)](#📝-Task-1:-Data-Inspection-(Pandas))\n", - " - [Task 2: Analyze Taxis (Pandas)](#📝-Task-2:-Analyze-Taxis-(Pandas))\n", - " - [Task 3: GPU Acceleration (cuDF)](#📝-Task-3:-GPU-Acceleration-(cuDF))\n", - "5. [Conclusion](#Conclusion)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Introduction\n", - "\n", - "In this notebook, we will build a foundation in data manipulation using **Pandas**, a popular tool for Python data analysis. Then, we will transition to **cuDF**, which allows us to run standard Pandas-like code on the GPU.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Learning Objectives:\n", - "\n", - "- **Introduce core Pandas operations:** Indexing, Filtering, Aggregating, and Time Series.\n", - "- **Learn the subtle differences** (and speed benefits) when porting code to cuDF.\n", - "- **Exercise:** Apply these skills to analyze a real-world NYC Parking Violations dataset.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** Throughout this notebook, we provide \"Quick Docs\" sections to remind you of common syntax. However, these are not exhaustive. For complete API details, parameters, and edge cases, you should always reference the official [Pandas Documentation](https://pandas.pydata.org/docs/) or the [cuDF Documentation](https://docs.rapids.ai/api/cudf/stable/).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 2. 
Pandas Essentials\n", - "\n", - "Before we accelerate with GPUs, let's ensure we are comfortable with the DataFrame API. Even if you are a Pandas pro, this refresher sets the baseline syntax we will replicate later.\n", - "\n", - "First, import the library:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1 Series and DataFrame Objects\n", - "\n", - "- **Series:** A one-dimensional labeled array (like a powerful list or a single column).\n", - "- **DataFrame:** A two-dimensional labeled data structure (like a spreadsheet or SQL table).\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `pd.Series(data)`: Create a Series.\n", - "- `pd.DataFrame(data, index)`: Create a DataFrame.\n", - "- `df.head(n)` / `df.tail(n)`: View the first/last n rows.\n", - "- `df.index` / `df.columns`: Access row labels and column names.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# A Series acts like a single column of data\n", - "s = pd.Series([10, 20, 30])\n", - "print(f\"Max value in series: {s.max()}\")\n", - "\n", - "# A DataFrame is a collection of Series sharing an index\n", - "df = pd.DataFrame({\n", - " \"a\": [1, 2, 1, 3, 2],\n", - " \"b\": [1, 4, 7, 2, 0],\n", - " \"c\": [3, 3, 3, 4, 5]\n", - "}, index=[1, 2, 3, 4, 5])\n", - "\n", - "# View the structure\n", - "print(\"Columns:\", df.columns)\n", - "df.head(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2 Selecting and Filtering Data\n", - "\n", - "Selecting specific subsets of data is the most common task in analysis. 
You can select by column name, label index, or integer position.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df['col']`: Select a single column (returns a Series).\n", - "- `df[['col1', 'col2']]`: Select multiple columns (returns a DataFrame).\n", - "- `df.loc[label]`: Select row(s) by index label.\n", - "- `df.iloc[position]`: Select row(s) by integer position (0-based).\n", - "- `df[condition]`: Boolean indexing (filtering).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Select specific columns\n", - "subset = df[[\"b\", \"c\"]]\n", - "\n", - "# Select rows by label (loc) and position (iloc)\n", - "row_label_2 = df.loc[2] # Row with index label 2\n", - "row_pos_0 = df.iloc[0] # First row (physically)\n", - "\n", - "# Boolean Indexing: Filter rows where column 'a' is greater than 1\n", - "filtered_df = df[df['a'] > 1]\n", - "filtered_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.3 Sorting\n", - "\n", - "Ordering data helps in ranking and visualization.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df.sort_values(by='col', ascending=True/False)`: Sort by one or more columns.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Sort by column 'a' in ascending order\n", - "sorted_df = df.sort_values(\"a\")\n", - "sorted_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.4 Summarizing Data\n", - "\n", - "It's straightforward to get a quick overview of your data's distribution.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df.describe()`: Summary statistics (count, mean, std, etc.).\n", - "- `df.mean()`, `df.sum()`, `df.max()`: Aggregations across columns.\n", - "- `df['col'].value_counts()`: Count unique values (useful for histograms).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ 
- "# Calculate the sum of every column\n", - "print(\"Sum of columns:\\n\", df.sum())\n", - "\n", - "# Count frequency of values in column 'a'\n", - "print(\"\\nValue counts for 'a':\\n\", df[\"a\"].value_counts())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.5 Grouped Aggregations (groupby)\n", - "\n", - "The \"Split-Apply-Combine\" strategy. Split data into groups based on some criteria, apply a function to each group, and combine the results.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df.groupby('col')`: Group data.\n", - "- `.mean()`, `.count()`: Apply aggregation.\n", - "- `.agg({'col': ['min', 'max']})`: Apply complex, specific aggregations.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Group by 'a' and calculate the mean of 'b' and 'c' for each group\n", - "grouped_mean = df.groupby(\"a\").mean()\n", - "print(grouped_mean)\n", - "\n", - "# Complex aggregation: Get min and mean of 'b', and max of 'c'\n", - "agg_df = df.groupby(\"a\").agg({\n", - " \"b\": [\"min\", \"mean\"],\n", - " \"c\": [\"max\"]\n", - "})\n", - "agg_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.6 String Operations\n", - "\n", - "Pandas provides vectorized string functions via the `.str` accessor.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df['col'].str.upper()`: Convert to uppercase.\n", - "- `df['col'].str.contains('pattern')`: Boolean check for substring.\n", - "- `df['col'].str.replace('old', 'new')`: Replace text." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Add a string column\n", - "df[\"names\"] = [\"mario\", \"luigi\", \"yoshi\", \"peach\", \"toad\"]\n", - "\n", - "# Convert to uppercase\n", - "df[\"names_upper\"] = df[\"names\"].str.upper()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.7 Time Series\n", - "\n", - "Pandas was originally developed for financial time series analysis. It handles dates and times robustly via the `.dt` accessor.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `pd.to_datetime()`: Convert strings to datetime objects.\n", - "- `df['date'].dt.year`: Extract year component.\n", - "- `df['date'].dt.dayofweek`: Extract day of week.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a date range\n", - "date_df = pd.DataFrame()\n", - "date_df[\"date\"] = pd.date_range(\"2018-11-20\", periods=5, freq=\"D\")\n", - "date_df[\"value\"] = np.random.sample(len(date_df))\n", - "\n", - "# Filter by date\n", - "subset_dates = date_df[date_df[\"date\"] < \"2018-11-23\"]\n", - "\n", - "# Extract features\n", - "date_df[\"year\"] = date_df[\"date\"].dt.year\n", - "date_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.8 User-Defined Operations (apply)\n", - "\n", - "When built-in functions aren't enough, you can apply custom Python functions.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df['col'].apply(func)`: Apply function `func` to every element.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def add_ten(x):\n", - " return x + 10\n", - "\n", - "# Apply the custom function\n", - "df[\"a_plus_10\"] = df[\"a\"].apply(add_ten)\n", - "df.head(2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 3. 
Enter cuDF: GPU DataFrames\n", - "\n", - "cuDF mimics the Pandas API but runs on the GPU. The transition is often as simple as changing the import, but there are some constraints you must know.\n", - "\n", - "First, let's create a GPU DataFrame.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import cudf\n", - "\n", - "# Create a cuDF DataFrame (data resides on GPU)\n", - "gdf = cudf.DataFrame({\n", - " \"a\": [1, 2, 1, 3, 2],\n", - " \"b\": [1, 4, 7, 2, 0],\n", - " \"c\": [1, 1, 8, 2, 9]\n", - "}, index=[1, 2, 3, 4, 5])\n", - "\n", - "# Operations work exactly the same!\n", - "print(type(gdf))\n", - "gdf.groupby(\"a\").mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.1 Exercise: Date Formatting Failure\n", - "\n", - "Pandas is very forgiving with date formats. cuDF is stricter. Run the cell below to see what happens when you use a non-standard date string.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# EXECUTE THIS CELL TO SEE THE ERROR\n", - "try:\n", - " date_df = cudf.DataFrame()\n", - " # Pandas handles \"11/20/2018\" easily. Does cuDF?\n", - " date_df[\"date\"] = cudf.date_range(\"11/20/2018\", periods=72, freq=\"D\")\n", - "except Exception as e:\n", - " print(f\"Error caught: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Why did this fail?** Unlike Pandas, cuDF currently requires ISO-standard date formats (Year-Month-Day) for creating date ranges.\n", - "\n", - "- **Pandas:** Guesses `11/20/2018` is Nov 20th.\n", - "- **cuDF:** Requires `2018-11-20`.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.2 Exercise: Why `.apply()` Breaks Down in cuDF\n", - "\n", - "In Pandas, `.apply()` works by executing Python-level code over the data, which makes it very flexible but CPU-centric. 
On the GPU, this model does not translate directly: GPUs cannot execute arbitrary Python bytecode. In cuDF, `.apply()`-style user-defined functions must be JIT-compiled with Numba for the CUDA target, which imposes constraints:\n", - "- The function must be Numba-compilable (no general Python objects or unsupported features).\n", - "- Only operations that Numba can lower to GPU device code are allowed.\n", - "- Code must follow a more restricted, compilation-friendly execution model.\n", - "\n", - "Note that cuDF UDFs are not limited to pure math. Some string operations and well-defined null-handling patterns are supported. See the [cuDF apply() documentation](https://docs.rapids.ai/api/cudf/latest/user_guide/api_docs/api/cudf.dataframe.apply/) for the full set of supported features and limitations.\n", - "\n", - "Even simple-looking Python functions often fall outside these constraints:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# A function that looks innocent but is NOT GPU-safe\n", - "def add_ten_verbose(x):\n", - " # Python branching + dynamic typing make this un-compilable for the GPU\n", - " if isinstance(x, (int, float)):\n", - " return x + 10\n", - " else:\n", - " return x" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is perfectly legal in Pandas. 
But in cuDF, Numba cannot:\n", - "\n", - "- interpret `isinstance`\n", - "- handle Python branching on object types\n", - "- JIT-compile dynamic return values\n", - "\n", - "Now try running it:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Execute this cell to observe the cuDF limitation\n", - "try:\n", - " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_verbose)\n", - "except Exception as e:\n", - " print(\"cuDF apply() constraint caught:\")\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**What happens?** cuDF attempts to compile the function → compilation fails → you get a runtime error. This mirrors real-world failure modes: anything that is not pure numerical logic will break.\n", - "\n", - "Here is the same logic, rewritten in a way the GPU can compile:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# GPU-safe version: no Python, no branching, pure math\n", - "def add_ten_gpu(x):\n", - " return x + 10\n", - "try:\n", - " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_gpu)\n", - "except Exception as e:\n", - " print(\"cuDF apply() constraint caught:\")\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is one of the few forms that Numba can translate. But even this version is not preferred. **cuDF `.apply()` is a last resort**. Even if your function compiles, `.apply()` still triggers:\n", - "\n", - "- JIT compilation overhead (slow startup)\n", - "- Kernel launch overhead\n", - "- Reduced optimization compared to built-in GPU operations\n", - "\n", - "For typical column transformations, this is simply unnecessary. 
\n", - "**Best practice is to always use vectorized operations:**\n", - "\n", - "```python\n", - "gdf[\"a\"] + 10\n", - "```\n", - "The vectorized version is:\n", - "- faster\n", - "- simpler\n", - "- more readable\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 4. Exercise: Analyzing Real Data (NYC Parking Violations)\n", - "\n", - "Now you will apply what you learned to a large, real-world dataset.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 0: Download Data\n", - "\n", - "We will fetch a subset of the NYC Parking Violations dataset (Fiscal Year 2022).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "![ -f nyc_parking_violations_2022.parquet ] || curl -fsSL -o nyc_parking_violations_2022.parquet https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 1: Data Inspection (Pandas)\n", - "\n", - "**Goal:** Load the data and inspect its structure.\n", - "\n", - "**Instructions:**\n", - "\n", - "1. Read the file `nyc_parking_violations_2022.parquet` into a Pandas DataFrame.\n", - "2. Print the columns.\n", - "3. Create a subset DataFrame with only: `Registration State`, `Violation Description`, `Vehicle Body Type`, `Issue Date`.\n", - "4. Display the head of this subset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "# TODO: Read parquet file\n", - "# df = ...\n", - "\n", - "# TODO: Print columns\n", - "# ...\n", - "\n", - "# TODO: Select specific columns\n", - "# df_subset = ...\n", - "\n", - "# TODO: Display head\n", - "# ..." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 2: Analyze Taxis (Pandas)\n", - "\n", - "**Goal:** Filter, Group, and Count.\n", - "\n", - "**Instructions:**\n", - "\n", - "1. Filter the DataFrame to find rows where `Vehicle Body Type` is `\"TAXI\"`.\n", - "2. Group by `Registration State`.\n", - "3. Count the occurrences to see which states the taxis are registered in.\n", - "4. Sort the results descending to find the top states.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO: Filter for TAXI\n", - "# taxi_df = ...\n", - "\n", - "# TODO: Group by State and count\n", - "# ...\n", - "\n", - "# TODO: Sort and display top results\n", - "# ..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 3: GPU Acceleration (cuDF)\n", - "\n", - "**Goal:** Measure the speedup.\n", - "\n", - "**Instructions:**\n", - "\n", - "1. Import `cudf`.\n", - "2. Use `%%time` at the top of the cell.\n", - "3. Replicate the entire pipeline (Read -> Filter columns -> Filter Rows -> Group -> Sort) using `cudf`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "import cudf\n", - "\n", - "# TODO: Replicate the analysis using cuDF\n", - "# ..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Conclusion\n", - "\n", - "Compare the **Wall time** of Task 2 vs Task 3. 
You should see a significant performance improvement with cuDF, especially as data size grows!\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DataFrames with Pandas and cuDF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "1. [Introduction](#1.-Introduction)\n", + "2. [Pandas Essentials](#2.-Pandas-Essentials)\n", + " - [2.1 Series and DataFrame Objects](#2.1-Series-and-DataFrame-Objects)\n", + " - [2.2 Selecting and Filtering Data](#2.2-Selecting-and-Filtering-Data)\n", + " - [2.3 Sorting](#2.3-Sorting)\n", + " - [2.4 Summarizing Data](#2.4-Summarizing-Data)\n", + " - [2.5 Grouped Aggregations (groupby)](#2.5-Grouped-Aggregations-(groupby))\n", + " - [2.6 String Operations](#2.6-String-Operations)\n", + " - [2.7 Time Series](#2.7-Time-Series)\n", + " - [2.8 User-Defined Operations (apply)](#2.8-User-Defined-Operations-(apply))\n", + "3. [Enter cuDF: GPU DataFrames](#3.-Enter-cuDF:-GPU-DataFrames)\n", + " - [3.1 Exercise: Date Formatting Failure](#3.1-Exercise:-Date-Formatting-Failure)\n", + " - [3.2 Exercise: Why `.apply()` Breaks Down in cuDF](#3.2-Exercise:-Why-`.apply()`-Breaks-Down-in-cuDF)\n", + "4. [Exercise: Analyzing Real Data (NYC Parking Violations)](#4.-Exercise:-Analyzing-Real-Data-(NYC-Parking-Violations))\n", + " - [Step 0: Download Data](#Step-0:-Download-Data)\n", + " - [Task 1: Data Inspection (Pandas)](#Task-1:-Data-Inspection-(Pandas))\n", + " - [Task 2: Analyze Taxis (Pandas)](#Task-2:-Analyze-Taxis-(Pandas))\n", + " - [Task 3: GPU Acceleration (cuDF)](#Task-3:-GPU-Acceleration-(cuDF))\n", + "5. [Conclusion](#Conclusion)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Introduction\n", + "\n", + "In this notebook, we will build a foundation in data manipulation using **Pandas**, a popular tool for Python data analysis. Then, we will transition to **cuDF**, which allows us to run standard Pandas-like code on the GPU.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Learning Objectives:\n", + "\n", + "- **Introduce core Pandas operations:** Indexing, Filtering, Aggregating, and Time Series.\n", + "- **Learn the subtle differences** (and speed benefits) when porting code to cuDF.\n", + "- **Exercise:** Apply these skills to analyze a real-world NYC Parking Violations dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** Throughout this notebook, we provide \"Quick Docs\" sections to remind you of common syntax. However, these are not exhaustive. For complete API details, parameters, and edge cases, you should always reference the official [Pandas Documentation](https://pandas.pydata.org/docs/) or the [cuDF Documentation](https://docs.rapids.ai/api/cudf/stable/).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 2. Pandas Essentials\n", + "\n", + "Before we accelerate with GPUs, let's ensure we are comfortable with the DataFrame API. 
Even if you are a Pandas pro, this refresher sets the baseline syntax we will replicate later.\n", + "\n", + "First, import the library:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Series and DataFrame Objects\n", + "\n", + "- **Series:** A one-dimensional labeled array (like a powerful list or a single column).\n", + "- **DataFrame:** A two-dimensional labeled data structure (like a spreadsheet or SQL table).\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `pd.Series(data)`: Create a Series.\n", + "- `pd.DataFrame(data, index)`: Create a DataFrame.\n", + "- `df.head(n)` / `df.tail(n)`: View the first/last n rows.\n", + "- `df.index` / `df.columns`: Access row labels and column names.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A Series acts like a single column of data\n", + "s = pd.Series([10, 20, 30])\n", + "print(f\"Max value in series: {s.max()}\")\n", + "\n", + "# A DataFrame is a collection of Series sharing an index\n", + "df = pd.DataFrame({\n", + " \"a\": [1, 2, 1, 3, 2],\n", + " \"b\": [1, 4, 7, 2, 0],\n", + " \"c\": [3, 3, 3, 4, 5]\n", + "}, index=[1, 2, 3, 4, 5])\n", + "\n", + "# View the structure\n", + "print(\"Columns:\", df.columns)\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selecting and Filtering Data\n", + "\n", + "Selecting specific subsets of data is the most common task in analysis. 
You can select by column name, label index, or integer position.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col']`: Select a single column (returns a Series).\n", + "- `df[['col1', 'col2']]`: Select multiple columns (returns a DataFrame).\n", + "- `df.loc[label]`: Select row(s) by index label.\n", + "- `df.iloc[position]`: Select row(s) by integer position (0-based).\n", + "- `df[condition]`: Boolean indexing (filtering).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select specific columns\n", + "subset = df[[\"b\", \"c\"]]\n", + "\n", + "# Select rows by label (loc) and position (iloc)\n", + "row_label_2 = df.loc[2] # Row with index label 2\n", + "row_pos_0 = df.iloc[0] # First row (physically)\n", + "\n", + "# Boolean Indexing: Filter rows where column 'a' is greater than 1\n", + "filtered_df = df[df['a'] > 1]\n", + "filtered_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Sorting\n", + "\n", + "Ordering data helps in ranking and visualization.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.sort_values(by='col', ascending=True/False)`: Sort by one or more columns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by column 'a' in ascending order\n", + "sorted_df = df.sort_values(\"a\")\n", + "sorted_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Summarizing Data\n", + "\n", + "It's straightforward to get a quick overview of your data's distribution.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.describe()`: Summary statistics (count, mean, std, etc.).\n", + "- `df.mean()`, `df.sum()`, `df.max()`: Aggregations across columns.\n", + "- `df['col'].value_counts()`: Count unique values (useful for histograms).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ 
+ "# Calculate the sum of every column\n", + "print(\"Sum of columns:\\n\", df.sum())\n", + "\n", + "# Count frequency of values in column 'a'\n", + "print(\"\\nValue counts for 'a':\\n\", df[\"a\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.5 Grouped Aggregations (groupby)\n", + "\n", + "The \"Split-Apply-Combine\" strategy. Split data into groups based on some criteria, apply a function to each group, and combine the results.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.groupby('col')`: Group data.\n", + "- `.mean()`, `.count()`: Apply aggregation.\n", + "- `.agg({'col': ['min', 'max']})`: Apply complex, specific aggregations.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Group by 'a' and calculate the mean of 'b' and 'c' for each group\n", + "grouped_mean = df.groupby(\"a\").mean()\n", + "print(grouped_mean)\n", + "\n", + "# Complex aggregation: Get min and mean of 'b', and max of 'c'\n", + "agg_df = df.groupby(\"a\").agg({\n", + " \"b\": [\"min\", \"mean\"],\n", + " \"c\": [\"max\"]\n", + "})\n", + "agg_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.6 String Operations\n", + "\n", + "Pandas provides vectorized string functions via the `.str` accessor.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col'].str.upper()`: Convert to uppercase.\n", + "- `df['col'].str.contains('pattern')`: Boolean check for substring.\n", + "- `df['col'].str.replace('old', 'new')`: Replace text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Add a string column\n", + "df[\"names\"] = [\"mario\", \"luigi\", \"yoshi\", \"peach\", \"toad\"]\n", + "\n", + "# Convert to uppercase\n", + "df[\"names_upper\"] = df[\"names\"].str.upper()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.7 Time Series\n", + "\n", + "Pandas was originally developed for financial time series analysis. It handles dates and times robustly via the `.dt` accessor.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `pd.to_datetime()`: Convert strings to datetime objects.\n", + "- `df['date'].dt.year`: Extract year component.\n", + "- `df['date'].dt.dayofweek`: Extract day of week.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a date range\n", + "date_df = pd.DataFrame()\n", + "date_df[\"date\"] = pd.date_range(\"2018-11-20\", periods=5, freq=\"D\")\n", + "date_df[\"value\"] = np.random.sample(len(date_df))\n", + "\n", + "# Filter by date\n", + "subset_dates = date_df[date_df[\"date\"] < \"2018-11-23\"]\n", + "\n", + "# Extract features\n", + "date_df[\"year\"] = date_df[\"date\"].dt.year\n", + "date_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.8 User-Defined Operations (apply)\n", + "\n", + "When built-in functions aren't enough, you can apply custom Python functions.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col'].apply(func)`: Apply function `func` to every element.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_ten(x):\n", + " return x + 10\n", + "\n", + "# Apply the custom function\n", + "df[\"a_plus_10\"] = df[\"a\"].apply(add_ten)\n", + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 3. 
Enter cuDF: GPU DataFrames\n", + "\n", + "cuDF mimics the Pandas API but runs on the GPU. The transition is often as simple as changing the import, but there are some constraints you must know.\n", + "\n", + "First, let's create a GPU DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cudf\n", + "\n", + "# Create a cuDF DataFrame (data resides on GPU)\n", + "gdf = cudf.DataFrame({\n", + " \"a\": [1, 2, 1, 3, 2],\n", + " \"b\": [1, 4, 7, 2, 0],\n", + " \"c\": [1, 1, 8, 2, 9]\n", + "}, index=[1, 2, 3, 4, 5])\n", + "\n", + "# Operations work exactly the same!\n", + "print(type(gdf))\n", + "gdf.groupby(\"a\").mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 Exercise: Date Formatting Failure\n", + "\n", + "Pandas is very forgiving with date formats. cuDF is stricter. Run the cell below to see what happens when you use a non-standard date string.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# EXECUTE THIS CELL TO SEE THE ERROR\n", + "try:\n", + " date_df = cudf.DataFrame()\n", + " # Pandas handles \"11/20/2018\" easily. Does cuDF?\n", + " date_df[\"date\"] = cudf.date_range(\"11/20/2018\", periods=72, freq=\"D\")\n", + "except Exception as e:\n", + " print(f\"Error caught: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Why did this fail?** Unlike Pandas, cuDF currently requires ISO-standard date formats (Year-Month-Day) for creating date ranges.\n", + "\n", + "- **Pandas:** Guesses `11/20/2018` is Nov 20th.\n", + "- **cuDF:** Requires `2018-11-20`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Exercise: Why `.apply()` Breaks Down in cuDF\n", + "\n", + "In Pandas, `.apply()` works by executing Python-level code over the data, which makes it very flexible but CPU-centric. 
On the GPU, this model does not translate directly: GPUs cannot execute arbitrary Python bytecode. In cuDF, `.apply()`-style user-defined functions must be JIT-compiled with Numba for the CUDA target, which imposes constraints:\n", + "- The function must be Numba-compilable (no general Python objects or unsupported features).\n", + "- Only operations that Numba can lower to GPU device code are allowed.\n", + "- Code must follow a more restricted, compilation-friendly execution model.\n", + "\n", + "Note that cuDF UDFs are not limited to pure math. Some string operations and well-defined null-handling patterns are supported. See the [cuDF apply() documentation](https://docs.rapids.ai/api/cudf/latest/user_guide/api_docs/api/cudf.dataframe.apply/) for the full set of supported features and limitations.\n", + "\n", + "Even simple-looking Python functions often fall outside these constraints:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A function that looks innocent but is NOT GPU-safe\n", + "def add_ten_verbose(x):\n", + " # Python branching + dynamic typing make this un-compilable for the GPU\n", + " if isinstance(x, (int, float)):\n", + " return x + 10\n", + " else:\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is perfectly legal in Pandas. 
But in cuDF, Numba cannot:\n", + "\n", + "- interpret `isinstance`\n", + "- handle Python branching on object types\n", + "- JIT-compile dynamic return values\n", + "\n", + "Now try running it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute this cell to observe the cuDF limitation\n", + "try:\n", + " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_verbose)\n", + "except Exception as e:\n", + " print(\"cuDF apply() constraint caught:\")\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**What happens?** cuDF attempts to compile the function → compilation fails → you get a runtime error. This mirrors real-world failure modes: anything that is not pure numerical logic will break.\n", + "\n", + "Here is the same logic, rewritten in a way the GPU can compile:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GPU-safe version: no Python, no branching, pure math\n", + "def add_ten_gpu(x):\n", + " return x + 10\n", + "try:\n", + " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_gpu)\n", + "except Exception as e:\n", + " print(\"cuDF apply() constraint caught:\")\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is one of the few forms that Numba can translate. But even this version is not preferred. **cuDF `.apply()` is a last resort**. Even if your function compiles, `.apply()` still triggers:\n", + "\n", + "- JIT compilation overhead (slow startup)\n", + "- Kernel launch overhead\n", + "- Reduced optimization compared to built-in GPU operations\n", + "\n", + "For typical column transformations, this is simply unnecessary. 
\n", + "**Best practice is to always use vectorized operations:**\n", + "\n", + "```python\n", + "gdf[\"a\"] + 10\n", + "```\n", + "The vectorized version is:\n", + "- faster\n", + "- simpler\n", + "- more readable\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 4. Exercise: Analyzing Real Data (NYC Parking Violations)\n", + "\n", + "Now you will apply what you learned to a large, real-world dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 0: Download Data\n", + "\n", + "We will fetch a subset of the NYC Parking Violations dataset (Fiscal Year 2022).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "![ -f nyc_parking_violations_2022.parquet ] || curl -fsSL -o nyc_parking_violations_2022.parquet https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 1: Data Inspection (Pandas)\n", + "\n", + "**Goal:** Load the data and inspect its structure.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Read the file `nyc_parking_violations_2022.parquet` into a Pandas DataFrame.\n", + "2. Print the columns.\n", + "3. Create a subset DataFrame with only: `Registration State`, `Violation Description`, `Vehicle Body Type`, `Issue Date`.\n", + "4. Display the head of this subset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# TODO: Read parquet file\n", + "# df = ...\n", + "\n", + "# TODO: Print columns\n", + "# ...\n", + "\n", + "# TODO: Select specific columns\n", + "# df_subset = ...\n", + "\n", + "# TODO: Display head\n", + "# ..." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2: Analyze Taxis (Pandas)\n", + "\n", + "**Goal:** Filter, Group, and Count.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Filter the DataFrame to find rows where `Vehicle Body Type` is `\"TAXI\"`.\n", + "2. Group by `Registration State`.\n", + "3. Count the occurrences to see which states the taxis are registered in.\n", + "4. Sort the results descending to find the top states.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Filter for TAXI\n", + "# taxi_df = ...\n", + "\n", + "# TODO: Group by State and count\n", + "# ...\n", + "\n", + "# TODO: Sort and display top results\n", + "# ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3: GPU Acceleration (cuDF)\n", + "\n", + "**Goal:** Measure the speedup.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Import `cudf`.\n", + "2. Use `%%time` at the top of the cell.\n", + "3. Replicate the entire pipeline (Read -> Filter columns -> Filter Rows -> Group -> Sort) using `cudf`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import cudf\n", + "\n", + "# TODO: Replicate the analysis using cuDF\n", + "# ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Conclusion\n", + "\n", + "Compare the **Wall time** of Task 2 vs Task 3. 
You should see a significant performance improvement with cuDF, especially as data size grows!\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/libraries/21__cudf_pandas__nyc_parking_violations.ipynb b/tutorials/accelerated-python/notebooks/libraries/21__cudf_pandas__nyc_parking_violations.ipynb index f8b96c1a..fb08a89c 100644 --- a/tutorials/accelerated-python/notebooks/libraries/21__cudf_pandas__nyc_parking_violations.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/21__cudf_pandas__nyc_parking_violations.ipynb @@ -261,6 +261,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (RAPIDS 25.10)", "language": "python", @@ -276,11 +282,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "colab": { - "provenance": [], - "gpuType": "T4" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/22__cuml.ipynb b/tutorials/accelerated-python/notebooks/libraries/22__cuml.ipynb index c33262ff..8ccc649b 100644 --- a/tutorials/accelerated-python/notebooks/libraries/22__cuml.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/22__cuml.ipynb @@ -198,7 +198,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.11.7" } }, 
"nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb b/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb index bf211c8f..3ce54127 100644 --- a/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb @@ -1,1465 +1,1479 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "42b6145d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Table of Contents\n", - "\n", - "1. [Introduction: CCCL & Customizing Algorithms](#1.-Introduction:-CCCL-&-Customizing-Algorithms)\n", - " - 1.1 [What is cuda-cccl?](#1.1-What-is-cuda-cccl?)\n", - " - 1.2 [When to use it?](#1.2-When-to-use-it?)\n", - " - 1.3 [Installation](#1.3-Installation)\n", - "2. [Basic Reductions](#2.-Basic-Reductions)\n", - " - 2.1 [Using `reduce_into()` to compute the sum of a sequence](#2.1-Using-`reduce_into()`-to-compute-the-sum-of-a-sequence)\n", - " - 2.2 [Exercise: computing the minimum value](#2.2-Exercise:-computing-the-minimum-value)\n", - "3. [Custom Reductions](#3.-Custom-Reductions)\n", - " - 3.1 [Example: sum of even values](#3.1-Example:-sum-of-even-values)\n", - " - 3.2 [Performance Comparison: Custom vs. Naive CuPy](#3.2-Performance-Comparison:-Custom-vs.-Naive-CuPy)\n", - "4. [Scanning Algorithms](#4.-Scanning-Algorithms)\n", - " - 4.1 [What is a Scan?](#4.1-What-is-a-Scan?)\n", - " - 4.2 [Maximum Scan Example](#4.2-Maximum-Scan-Example)\n", - "5. [Sorting Algorithms](#5.-Sorting-Algorithms)\n", - " - 5.1 [Merge Sort](#5.1-Merge-Sort)\n", - " - 5.1.1 [Exercise: sort by the last digit](#5.1.1-Exercise:-sort-by-the-last-digit)\n", - " - 5.2 [Radix Sort](#5.2-Radix-Sort)\n", - "6. 
[Transformation Algorithms](#6.-Transformation-Algorithms)\n", - " - 6.1 [Unary and Binary Transform](#6.1-Unary-and-Binary-Transform)\n", - " - 6.2 [Data Normalization with Transform](#6.2-Data-Normalization-with-Transform)\n", - " - 6.3 [Transform with Iterators for Memory Efficiency](#6.3-Transform-with-Iterators-for-Memory-Efficiency)\n", - "7. [Custom (Struct) Data Types](#7.-Custom-(Struct)-Data-Types)\n", - "8. [Working with Iterators](#8.-Working-with-Iterators)\n", - " - 8.1 [CountingIterators and ConstantIterator](#8.1-CountingIterators-and-ConstantIterator)\n", - " - 8.2 [TransformIterator](#8.2-TransformIterator)\n", - " - 8.3 [ZipIterator](#8.3-ZipIterator)\n", - "9. [Capstone Exercise: Implementing Running Average](#9.-Capstone-Exercise:-Implementing-Running-Average)" - ] - }, - { - "cell_type": "markdown", - "id": "575a8a12", - "metadata": {}, - "source": [ - "# 1. Introduction: CCCL & Customizing Algorithms\n" - ] - }, - { - "cell_type": "markdown", - "id": "008c05bf-5615-4b0b-8294-d66795f1f155", - "metadata": {}, - "source": [ - "## 1.1 What is cuda-cccl?" - ] - }, - { - "cell_type": "markdown", - "id": "98385a37-45b5-4b42-8a30-22112d2292df", - "metadata": {}, - "source": [ - "The [CUDA Core Compute Libraries (CCCL)](https://nvidia.github.io/cccl/unstable/python/) provide high-quality, high-performance abstractions for CUDA development in Python. The `cuda-cccl` Python package is composed of two indepdendent subpackages:\n", - "\n", - "* `cuda.compute` is a **parallel algorithms library** containing algorithms like `reduce`, `transform`, `scan` and `sort`. These can be combined to implement more complex algorithms, while delivering the performance of hand-optimized CUDA kernels, portable across different GPU architectures. 
They are general-purpose and **designed to be used with CuPy, PyTorch and other array/tensor frameworks.**.\n", - "\n", - "* `cuda.coop` is a lower-level library containing **cooperative algorithms meant to be used within (numba) CUDA kernels**. Examples include _block-wide reduction_ and _warp-wide scan_, providing numba CUDA kernel developers with building blocks to create speed-of-light, custom kernels." - ] - }, - { - "cell_type": "markdown", - "id": "79acd5e0-640c-4b61-8528-f57904d0ca95", - "metadata": {}, - "source": [ - "## 1.2 When to use it?" - ] - }, - { - "cell_type": "markdown", - "id": "14f05b64-f709-41de-8899-add8b4c3dcb3", - "metadata": {}, - "source": [ - "`cccl` provides a level of abstraction in between tensor libraries and raw CUDA kernels.\n", - "\n", - "- If you want to implement custom functionality that can not easily and efficiently be expressed using PyTorch/CuPy operations, you can reach for `cuda.compute` before resorting to writing CUDA kernels.\n", - "- If you _do_ need to write a kernel, you can often make use of the block-level and warp-level primitives offered by `cuda.coop` to write your kernel much more efficiently and concisely. " - ] - }, - { - "cell_type": "markdown", - "id": "3e2b1188-ce5e-4a10-84a1-ed6ace92922f", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "71178c05", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## 1.3 Installation" - ] - }, - { - "cell_type": "markdown", - "id": "fc78704b-cc16-4f3a-8a76-a1ecb4acd8e3", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "The command below installs `cuda-cccl` along with pieces of the CUDA toolkit it needs. You'll only need to do this in Google Colab." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23c21a66-dcf9-44f5-8262-78aa77754523", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\") and not os.path.exists(\"/ach-installed\"): # If running in Google Colab:\n", - " !pip uninstall \"cuda-python\" --yes > /dev/null\n", - " !pip install \"numba-cuda\" \"cuda-cccl[test-cu12]\" > /dev/null 2>&1\n", - " open(\"/ach-installed\", \"a\").close()" - ] - }, - { - "cell_type": "markdown", - "id": "10355920-9bfc-4788-bfe7-ab99440a6d98", - "metadata": {}, - "source": [ - "The `[test-cu12]` extras installs CuPy, which we will use in our examples. It is not strictly a dependency of `cuda-cccl` - you can use any array-like object (like PyTorch tensors) as well." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bca5a026-db41-4079-808e-b8c3c7196c7a", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "import cuda.compute as comp" - ] - }, - { - "cell_type": "markdown", - "id": "01908d54", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "# 2. Basic Reductions" - ] - }, - { - "cell_type": "markdown", - "id": "2d468957-1411-4cae-a050-a75692452cc2", - "metadata": {}, - "source": [ - "A **reduction** takes many values and combines them into a single result using a binary operation.\n", - "\n", - "As a simple example, consider a sequence of values like $[2, 3, 5, 1, 7, 6, 8, 4]$. The *sum* of the values of that sequence is a reduction using _addition_ as the binary operation: $(2 + 3 + 5 + 1 + 7 + 6 + 8 + 4) = 36$. Similarly, the *maximum value* can be obtained by performing a reduction using `max(a, b)` as the binary operation." 
- ] - }, - { - "cell_type": "markdown", - "id": "83065c06-aeab-4d26-9d18-69edb8462c2d", - "metadata": {}, - "source": [ - "A reduction can be computed in parallel. Typically this is done using a \"tree\" reduction where elements are combined in pairs across multiple levels, resembling the structure of a binary tree. At each level, the number of elements is halved as partial results are computed in parallel. This continues until a single final result is obtained at the root of the tree.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "01d0feb1-7ed0-42dc-bb83-8b017d89a2a6", - "metadata": {}, - "source": [ - "If you know some CUDA, you can quite easily write a kernel to implement this kind of parallel reduction. However, optimizing it for the specific CUDA architecture of your device, and generalizing for different data types and sizes can be difficult.\n", - "\n", - "This is where `cuda.compute` comes in. It provides optimized implementations of algorithms like reduction that give the best possible performance." 
- ] - }, - { - "cell_type": "markdown", - "id": "23cb8fb8-8e35-4179-8a66-2c3b5e6077ae", - "metadata": {}, - "source": [ - "## 2.1 Using `reduce_into()` to compute the sum of a sequence" - ] - }, - { - "cell_type": "markdown", - "id": "9c8f6367-d7f9-4030-bc57-4b8920299b47", - "metadata": {}, - "source": [ - "`cuda.compute` provides a `reduce_into()` function to compute general reductions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adce5791", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "Using `reduce_into()` to compute the sum of a sequence\n", - "\"\"\"\n", - "\n", - "# Prepare the inputs and outputs.\n", - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32) # input sequence, a CuPy (device) array\n", - "d_output = cp.empty(1, dtype=np.int32) # array which will hold the result, a CuPy (device) array of size 1\n", - "h_init = np.array([0], dtype=np.int32) # initial value of the reduction, a NumPy (host) array of size 1\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(d_input, d_output, comp.OpKind.PLUS, len(d_input), h_init)\n", - "\n", - "print(d_input)\n", - "# Verify the result.\n", - "expected_output = 36\n", - "assert (d_output == expected_output).all()\n", - "result = d_output[0]\n", - "print(f\"Sum reduction result: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "2f00121b-bfb1-4b37-9651-230386d9c256", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## 2.2 Exercise: computing the minimum value" - ] - }, - { - "cell_type": "markdown", - "id": "8f5e9ffc-89cd-4859-97ea-93abbb8b3f4b", - "metadata": {}, - "source": [ - "`reduce_into()` can be used to compute other reductions " - ] - }, - { - "cell_type": "markdown", - "id": "9241706a-b152-41b3-bba7-d281c9e43675", - "metadata": {}, - "source": [ - "Similar to the examples above, below is an 
incomplete code snippet for computing the minimum value of a sequence. Complete the section between the comments `begin TODO` and `end TODO` to use `reduce_into()` to compute the minimum." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44cb3daf-1a82-4af3-965e-b0d4be56b17e", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Using `reduce_into()` to compute the minimum value of a sequence\n", - "\"\"\"\n", - "\n", - "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "\n", - "# begin TODO\n", - "\n", - "\n", - "# end TODO\n", - "\n", - "expected_output = -6\n", - "assert (d_output == expected_output).all()\n", - "result = d_output[0]\n", - "print(f\"Min reduction result: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "9839aacd-7256-484c-8205-e48068f3217b", - "metadata": {}, - "source": [ - "# 3. Custom Reductions" - ] - }, - { - "cell_type": "markdown", - "id": "9b74da96-38ff-434a-87fa-bba49e37bf5a", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## 3.1 Example: sum of even values" - ] - }, - { - "cell_type": "markdown", - "id": "a0395884-0786-4b75-9442-046609041439", - "metadata": {}, - "source": [ - "At this point, you might be thinking:" - ] - }, - { - "cell_type": "markdown", - "id": "f09ef69c-07d1-4ca7-af22-89f105d7c532", - "metadata": {}, - "source": [ - "> **_Umm, can't I just use CuPy or PyTorch to compute sum or max?_**" - ] - }, - { - "cell_type": "markdown", - "id": "cd42b854-3576-4cf8-9880-f1aed514a10b", - "metadata": {}, - "source": [ - "Of course, given a CuPy array, it's trivial to do simple reductions like `sum`, `min` or `max`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61248d58-dcb7-4bf2-9b8b-9669ea2cd3d6", - "metadata": {}, - "outputs": [], - "source": [ - "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", - "\n", - 
"print(f\"Sum using cp.sum: {cp.sum(d_input)}\")\n", - "print(f\"Max value using cp.max: {cp.max(d_input)}\")\n", - "print(f\"Min value using cp.min: {cp.min(d_input)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "b96e0729-87d3-4423-ac13-28c5d34e3786", - "metadata": {}, - "source": [ - "The benefit of `cuda-cccl` is more apparent when you want to do custom operations. For example, rather than just computing a straightforward `sum`, let's say we wanted to compute the sum of **only even values** in a sequence. Naively, here's how to do that with CuPy:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "929b31ba-ce77-4fcd-9750-5efa0c13c346", - "metadata": {}, - "outputs": [], - "source": [ - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", - "result = (d_input[d_input % 2 == 0]).sum()\n", - "print(f\"Sum of even values with CuPy: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "48fd15a3-f616-4337-a7d7-c81c999a73f7", - "metadata": {}, - "source": [ - "Now, let's do the same thing with `parallel`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03947ed8-c6f9-49ae-b382-db7a0ef14931", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Using `reduce_into()` with a custom binary operation\n", - "\"\"\"\n", - "\n", - "# Define a custom binary operation for the reduction.\n", - "def sum_even_op(a, b):\n", - " return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0)\n", - "\n", - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Call `reduce_into()` passing the function above for the binary operation:\n", - "comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", - "result = d_output.get()[0]\n", - "print(f\"Sum of even values with `cuda.compute`: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": 
"e7c9205d-7668-4a72-a01d-f57e2d1a3bb1", - "metadata": {}, - "source": [ - "We got the same result using `cuda.compute`, but we had to write significantly more code. Is it worth it? Below is a small benchmarking script comparing timings for a range of input sizes:" - ] - }, - { - "cell_type": "markdown", - "id": "43afe1e9-c870-43e0-b884-2c7f93581869", - "metadata": {}, - "source": [ - "## 3.2 Performance Comparison: Custom vs. Naive CuPy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "729a1867-3a21-4f09-af97-f02d0a562d9f", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Compare the performance of the `parallel` implementation with a naive CuPy implementation\n", - "\"\"\"\n", - "\n", - "import timeit\n", - "\n", - "def evens_sum_cupy(d_input, d_output, h_init):\n", - " # ignore h_init\n", - " cp.sum(d_input[d_input % 2 == 0], out=d_output[0])\n", - "\n", - "def evens_sum_cccl(d_input, d_output, h_init):\n", - " # note, using `op` as the binary operation, rather than `OpKind.PLUS`:\n", - " comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", - "\n", - "def time_gpu_func(f, *args, **kwargs):\n", - " cp.cuda.Device().synchronize()\n", - " t1 = timeit.default_timer()\n", - " n = 1_000\n", - " for i in range(n):\n", - " f(*args, **kwargs)\n", - " cp.cuda.Device().synchronize()\n", - " t2 = timeit.default_timer()\n", - " return t2 - t1\n", - "\n", - "sizes = [10_000, 100_000, 1_000_000, 10_000_000, 100_000_000]\n", - "cccl_times = []\n", - "cp_times = []\n", - "\n", - "for n in sizes:\n", - " d_input = cp.random.randint(low=0, high=10, size=n, dtype=np.int32)\n", - " d_out = cp.empty(1, dtype=np.int32)\n", - " h_init = np.array([0], dtype=np.int32)\n", - "\n", - " cccl_times.append(time_gpu_func(evens_sum_cccl, d_input, d_out, h_init))\n", - " cp_times.append(time_gpu_func(evens_sum_cupy, d_input, d_out, h_init))\n", - "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Plotting\n", - "fig = 
plt.figure(figsize=(10, 5))\n", - "plt.loglog(sizes, cccl_times, marker='o', label='cuda.ccl')\n", - "plt.loglog(sizes, cp_times, marker='s', label='CuPy')\n", - "\n", - "# Annotate each cuda.ccl point with speedup vs CuPy\n", - "for x, t_cccl, t_cp in zip(sizes, cccl_times, cp_times):\n", - " speedup = t_cp / t_cccl\n", - " label = f\"{speedup:.1f}x faster\"\n", - " plt.annotate(label,\n", - " (x, t_cccl),\n", - " textcoords=\"offset points\",\n", - " xytext=(5, -10), # offset position\n", - " ha='left',\n", - " fontsize=9,\n", - " color='green')\n", - "\n", - "# Labels and title\n", - "plt.xlabel('Input Size')\n", - "plt.ylabel('Time (seconds)')\n", - "plt.title('Timing Comparison for evens_sum.')\n", - "plt.legend()\n", - "plt.grid(True)\n", - "plt.tight_layout()\n" - ] - }, - { - "cell_type": "markdown", - "id": "a0a0d085-75ab-4836-afb6-72ec1abd1d6a", - "metadata": {}, - "source": [ - "We see that using `cuda.compute` is much faster than our naive CuPy approach. This is because:\n", - "\n", - "* Operator fusion: the CuPy operation `x[x % 2 == 0]).sum()` is actually 4 separate operations (and at least 4 separate CUDA kernel invocations). With `cuda.compute`, we have a single call to `reduce_into()` that does all the computation.\n", - "* No intermediate memory allocations.\n", - "* Lesser Python overhead: `cuda.compute` is a lower-level library. You don't have to jump through multiple layers of Python before invoking device code." - ] - }, - { - "cell_type": "markdown", - "id": "a987e11b-777b-4da3-81f8-5808c0dd8836", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "# 4. Scanning Algorithms\n", - "\n", - "## 4.1 What is a Scan?\n", - "\n", - "A **scan** (also called prefix sum) computes a running total of elements. 
For each position, it shows the cumulative result up to that point.\n", - "\n", - "**Two types of scans:**\n", - "* **Inclusive scan**: Includes the current element in the sum\n", - "* **Exclusive scan**: Excludes the current element (shifts results)\n", - "\n", - "**Visual example:**\n", - "\n", - "```\n", - "Input: [3, 1, 4, 1, 5]\n", - "Inclusive: [3, 4, 8, 9, 14] (3, 3+1, 3+1+4, 3+1+4+1, 3+1+4+1+5)\n", - "Exclusive: [0, 3, 4, 8, 9] (0, 3, 3+1, 3+1+4, 3+1+4+1)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fca0df7f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)\n", - "d_inclusive = cp.empty_like(d_input)\n", - "d_exclusive = cp.empty_like(d_input)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "def add_op(a, b):\n", - " return a + b\n", - "\n", - "comp.inclusive_scan(d_input, d_inclusive, add_op, h_init, len(d_input))\n", - "comp.exclusive_scan(d_input, d_exclusive, add_op, h_init, len(d_input))\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "print(f\"Inclusive scan: {d_inclusive.get()}\")\n", - "print(f\"Exclusive scan: {d_exclusive.get()}\")\n", - "\n", - "# Verify with NumPy\n", - "np_inclusive = np.cumsum(d_input.get())\n", - "np_exclusive = np.concatenate([[0], np_inclusive[:-1]])\n", - "np.testing.assert_allclose(d_inclusive.get(), np_inclusive)\n", - "np.testing.assert_allclose(d_exclusive.get(), np_exclusive)\n", - "print(f\"NumPy inclusive: {np_inclusive}\")\n", - "print(f\"NumPy exclusive: {np_exclusive}\")" - ] - }, - { - "cell_type": "markdown", - "id": "d5c4b010", - "metadata": {}, - "source": [ - "## 4.2 Maximum Scan Example\n", - "\n", - "Scans aren't limited to addition. 
Here's an example using maximum operation to find running maximum.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "088f30c8", - "metadata": {}, - "outputs": [], - "source": [ - "# Running maximum example\n", - "d_input = cp.array([3, 7, 2, 9, 1, 8, 4, 6], dtype=np.int32)\n", - "d_output = cp.empty_like(d_input)\n", - "\n", - "def max_op(a, b):\n", - " return a if a > b else b\n", - "\n", - "# Start with a very small value\n", - "h_init = np.array([-999999], dtype=np.int32)\n", - "\n", - "# Perform inclusive scan with max operation\n", - "comp.inclusive_scan(d_input, d_output, max_op, h_init, len(d_input))\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "print(f\"Running max: {d_output.get()}\")\n", - "\n", - "# Verify with NumPy\n", - "np_running_max = np.maximum.accumulate(d_input.get())\n", - "print(f\"NumPy max: {np_running_max}\")\n", - "print(f\"Match: {np.array_equal(d_output.get(), np_running_max)}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "5e37c3f0-b7cb-45e6-83bf-2bbbeeab74b2", - "metadata": {}, - "source": [ - "# 5. Sorting Algorithms" - ] - }, - { - "cell_type": "markdown", - "id": "7aaa94ff", - "metadata": {}, - "source": [ - "## 5.1 Merge Sort" - ] - }, - { - "cell_type": "markdown", - "id": "e5c4043e-6dec-44f4-ab6b-15f24d0c3cfb", - "metadata": {}, - "source": [ - "The `merge_sort` function can be used to perform key-value sorting." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a893a898-e74b-4ea2-b69a-4432a342fead", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input arrays.\n", - "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", - "d_in_values = cp.asarray(\n", - " [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype=\"float32\"\n", - ")\n", - "\n", - "# Perform the merge sort.\n", - "comp.merge_sort(\n", - " d_in_keys,\n", - " d_in_values,\n", - " d_in_keys, # reuse input array to store output\n", - " d_in_values, # reuse input array to store output\n", - " comp.OpKind.LESS,\n", - " d_in_keys.size,\n", - ")\n", - "\n", - "print(f\"Sorted keys: {d_in_keys.get()}\")\n", - "print(f\"Sorted values: {d_in_values.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "13dafff6-7a05-43d4-bedb-1c6cc0027573", - "metadata": {}, - "source": [ - "If you just want to sort keys (with no corresponding values), just pass `None`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e90bc0c-810e-46d9-9ef9-9f3adaeecd10", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", - "\n", - "print(d_in_keys)\n", - "\n", - "# Perform the merge sort.\n", - "comp.merge_sort(\n", - " d_in_keys,\n", - " None, # don't specify a values array\n", - " d_in_keys, # reuse input array to store output\n", - " None, # don't specify a values array\n", - " comp.OpKind.LESS,\n", - " d_in_keys.size,\n", - ")\n", - "\n", - "print(f\"Sorted keys: {d_in_keys.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "6c6ffd71-90d1-461f-843b-96aca2990206", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "### 5.1.1 Exercise: sort by the last digit" - ] - }, - { - "cell_type": "markdown", - "id": "3b91fe2a-169e-4ee6-8408-ee87cf49b481", - 
"metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "In this excercise, you'll use `merge_sort` with a custom comparator function to sort elements by the last digit.\n", - "For example, $[29, 9, 136, 1001, 72, 24, 32, 1] \\rightarrow [1001, 1, 72, 32, 24, 136, 29, 9]$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88e95ff4-013a-4ca9-bdf4-5e122ba0f66a", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in_keys = cp.asarray([29, 9, 136, 1001, 72, 24, 32, 1], dtype=\"int32\")\n", - "\n", - "# define the custom comparator.\n", - "def comparison_op(lhs, rhs):\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "# Perform the merge sort.\n", - "comp.merge_sort(\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - ")\n", - "\n", - "print(f\"Result: {d_in_keys}\")\n", - "expected = np.asarray([1001, 1, 72, 32, 24, 136, 29, 9], dtype=np.int32)\n", - "assert (d_in_keys.get() == expected).all()" - ] - }, - { - "cell_type": "markdown", - "id": "b0b45660-9d80-43a9-9b7b-22eaeed7df4f", - "metadata": {}, - "source": [ - "## 5.2 Radix Sort\n", - "\n", - "The `radix_sort` function provides fast sorting for numeric types using the radix sort algorithm. Unlike merge sort, radix sort doesn't use comparisons but instead processes the bits/digits of numbers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f61f655", - "metadata": {}, - "outputs": [], - "source": [ - "# Basic radix sort example (ascending order)\n", - "d_input = cp.array([64, 34, 25, 12, 22, 11, 90, 5, 77, 30], dtype=np.int32)\n", - "d_output = cp.empty_like(d_input)\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "\n", - "# Sort in ascending order\n", - "comp.radix_sort(\n", - " d_input, # Input keys\n", - " d_output, # Output keys\n", - " None, # Input values (none for keys-only sort)\n", - " None, # Output values (none)\n", - " comp.SortOrder.ASCENDING, # Sort order\n", - " len(d_input) # Number of elements\n", - ")\n", - "\n", - "print(f\"Sorted: {d_output.get()}\")\n", - "\n", - "# Verify sorting\n", - "is_sorted = all(d_output.get()[i] <= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", - "print(f\"Properly sorted: {is_sorted}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdaa70a2", - "metadata": {}, - "outputs": [], - "source": [ - "# Descending order sort\n", - "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 3], dtype=np.int32)\n", - "d_output = cp.empty_like(d_input)\n", - "\n", - "comp.radix_sort(\n", - " d_input, d_output, None, None,\n", - " comp.SortOrder.DESCENDING, # Sort in reverse order\n", - " len(d_input)\n", - ")\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "print(f\"Descending sort: {d_output.get()}\")\n", - "\n", - "# Verify descending order\n", - "is_descending = all(d_output.get()[i] >= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", - "print(f\"Properly descending: {is_descending}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17ce2740", - "metadata": {}, - "outputs": [], - "source": [ - "# Key-value sorting: sort scores while keeping student IDs aligned\n", - "scores = [85, 92, 78, 96, 88, 71, 94]\n", - "student_ids = [101, 102, 103, 104, 105, 106, 107]\n", - "\n", - "d_keys = cp.array(scores, 
dtype=np.int32)\n", - "d_values = cp.array(student_ids, dtype=np.int32)\n", - "d_keys_out = cp.empty_like(d_keys)\n", - "d_values_out = cp.empty_like(d_values)\n", - "\n", - "print(\"Before sorting:\")\n", - "for score, student_id in zip(scores, student_ids):\n", - " print(f\" Student {student_id}: {score}\")\n", - "\n", - "# Sort by scores (highest first), keep student IDs aligned\n", - "comp.radix_sort(\n", - " d_keys, d_keys_out, # Input/output keys (scores)\n", - " d_values, d_values_out, # Input/output values (student IDs)\n", - " comp.SortOrder.DESCENDING, # Highest scores first\n", - " len(d_keys)\n", - ")\n", - "\n", - "sorted_scores = d_keys_out.get()\n", - "sorted_ids = d_values_out.get()\n", - "\n", - "print(\"\\nAfter sorting (by score, highest first):\")\n", - "for score, student_id in zip(sorted_scores, sorted_ids):\n", - " print(f\" Student {student_id}: {score}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "423fd14a", - "metadata": {}, - "source": [ - "# 6. Transformation Algorithms\n" - ] - }, - { - "cell_type": "markdown", - "id": "5c001e63-54ab-4943-83be-7503fea3ae0a", - "metadata": {}, - "source": [ - "## 6.1 Unary and Binary Transform\n", - "\n", - "### Unary transform" - ] - }, - { - "cell_type": "markdown", - "id": "b55614c6-6865-4bb8-941d-ae73290c7a91", - "metadata": {}, - "source": [ - "The `unary_transform` function applies a user-provided unary operation to each element of the input." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b45d79d-87a3-47c2-9681-8810535d3a80", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)\n", - "d_out = cp.empty_like(d_in)\n", - "\n", - "def double_op(a):\n", - " return a * 2\n", - "\n", - "# Perform the unary transform.\n", - "comp.unary_transform(d_in, d_out, double_op, len(d_in))\n", - "print(f\"Result of unary transform: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4e413c39-444b-43f8-b5e5-bd1adc18417c", - "metadata": {}, - "source": [ - "### Binary transform" - ] - }, - { - "cell_type": "markdown", - "id": "fa210a46-0c41-4c3f-94bd-954bc6add058", - "metadata": {}, - "source": [ - "The `binary_transform` function applies a user-provided binary operation to pairs of elements from two inputs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "346192c1-2318-437f-aaf5-c8e5f3b21d1e", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", - "d_in2 = cp.asarray([7, 2, 1, 0, -1], dtype=np.int32)\n", - "d_out = cp.empty_like(d_in1)\n", - "\n", - "# Perform the binary transform.\n", - "comp.binary_transform(d_in1, d_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", - "print(f\"Result of binary transform: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "cd76102c-c085-4893-b047-5c22eeabb20b", - "metadata": {}, - "source": [ - "## 6.2 Data Normalization with Transform\n", - "\n", - "Transform operations are commonly used in machine learning for data preprocessing, such as normalizing features to have zero mean and unit variance." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fd56eac", - "metadata": {}, - "outputs": [], - "source": [ - "# Example: Normalize house prices for machine learning\n", - "house_prices = np.array([250000, 180000, 320000, 420000, 150000, 380000, 220000, 295000], dtype=np.float32)\n", - "d_prices = cp.array(house_prices)\n", - "d_normalized = cp.empty_like(d_prices)\n", - "\n", - "# Calculate statistics for normalization\n", - "price_mean = float(np.mean(house_prices))\n", - "price_std = float(np.std(house_prices))\n", - "\n", - "print(f\"Original prices: {house_prices}\")\n", - "print(f\"Mean: ${price_mean:,.0f}, Std: ${price_std:,.0f}\")\n", - "\n", - "def z_score_normalize(price):\n", - " \"\"\"Z-score normalization: (x - mean) / std\"\"\"\n", - " return (price - price_mean) / price_std\n", - "\n", - "# Apply normalization transformation\n", - "comp.unary_transform(d_prices, d_normalized, z_score_normalize, len(house_prices))\n", - "\n", - "normalized_result = d_normalized.get()\n", - "print(f\"Normalized prices: {normalized_result}\")\n", - "print(f\"Normalized mean: {np.mean(normalized_result):.6f}\")\n", - "print(f\"Normalized std: {np.std(normalized_result):.6f}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "17282ee7", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "# 7. Custom (Struct) Data Types\n" - ] - }, - { - "cell_type": "markdown", - "id": "541308b1-65b3-4856-adb9-aa1f039f0396", - "metadata": {}, - "source": [ - "So far, we've seen how to use `parallel` with input arrays composed of numeric values (ints and floats). A powerful feature of `parallel` is that it can also work with \"struct\" values, i.e., values that are in turn composed of more than one value. 
" - ] - }, - { - "cell_type": "markdown", - "id": "df4ed2b7-a379-4ff4-941d-f0da11ac0a7a", - "metadata": {}, - "source": [ - "For example, consider a sequence of RGB values, like those used in graphics applications. Each RGB value represents a pixel's color and consists of three components: **red**, **green**, and **blue** intensity levels." - ] - }, - { - "cell_type": "markdown", - "id": "6fa5625f-a13f-468a-baa1-84b7e5abd427", - "metadata": {}, - "source": [ - "The code below shows how you can use `parallel` to find the pixel with the highest **green** intensity level." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9281737-5422-41a3-8052-7ade54a96de0", - "metadata": {}, - "outputs": [], - "source": [ - "# use `@gpu_struct` to define the data type of each value:\n", - "@comp.gpu_struct\n", - "class Pixel:\n", - " r: np.int32\n", - " g: np.int32\n", - " b: np.int32\n", - "\n", - "# Define a reduction operation that operates on two `Pixel` objects:\n", - "def max_g_value(x, y):\n", - " return x if x.g > y.g else y\n", - "\n", - "# Prepare the input and output arrays. These are just CuPy arrays:\n", - "dtype = np.dtype([(\"r\", np.int32), (\"g\", np.int32), (\"b\", np.int32)], align=True) # alternately, use `Pixel.dtype`\n", - "d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(dtype)\n", - "d_out = cp.empty(1, dtype)\n", - "\n", - "# Define the initial value for the reduction. This must be a `Pixel` object:\n", - "h_init = Pixel(0, 0, 0)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(d_rgb, d_out, max_g_value, d_rgb.size, h_init)\n", - "\n", - "# Verify the result.\n", - "print(f\"Input RGB values: \\n {d_rgb.get()}\")\n", - "result = d_out.get()\n", - "print(f\"Pixel with greatest 'g' intensity: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "2567bc4d", - "metadata": {}, - "source": [ - "# 8. 
Working with Iterators\n", - "\n", - "Now you have a taste for how to use `parallel` with **custom ops** and **custom data types**. _Iterators_ are another powerful tool in your toolbox for solving more complex problems.\n", - "\n", - "Iterators represent streams of data that are computed \"on-the-fly\". Unlike arrays, iterators do not require any memory allocation, and thus can represent huge sequences without consuming valuable GPU memory. Iterators can be used as inputs (and sometimes outputs) to algorithms in place of arrays.\n", - "\n", - "Note that \"iterators\" in the context of the `parallel` library is distinct from the concept of [iterators](https://docs.python.org/3/glossary.html#term-iterator) in the Python language. " - ] - }, - { - "cell_type": "markdown", - "id": "31f350bc-3825-49c3-a858-647aa34177aa", - "metadata": {}, - "source": [ - "## 8.1 CountingIterators and ConstantIterator" - ] - }, - { - "cell_type": "markdown", - "id": "514e4183-fc23-4355-8ad4-7b9989c9eaf4", - "metadata": {}, - "source": [ - "A `CountingIterator` represents the sequence `a, a + 1, a + 2, a + 3,.... `. In the following example, we use a `CountingIterator` as the input to `reduce_into` to compute the sum $1 + 2 + 3 + 4 + 5 = 15$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "604e5f6b", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the inputs and outputs:\n", - "it_input = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", - "\n", - "print(f\"Sum: {d_output.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "dec7e377", - "metadata": {}, - "source": [ - "A `ConstantIterator` represents the sequence `a, a, a, ...`. 
In the following example, we use a `ConstantIterator` as one of the inputs to `binary_transform`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fcff9cb6-ac1d-43e6-b439-ba7c89029b8c", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", - "it_in2 = comp.ConstantIterator(np.int32(1))\n", - "d_out = cp.empty_like(d_in1)\n", - "\n", - "# Perform the binary transform.\n", - "comp.binary_transform(d_in1, it_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", - "print(f\"Result of binary transform: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "84e88c3f-8325-47f5-a759-3355b73083f3", - "metadata": {}, - "source": [ - "## 8.2 TransformIterator" - ] - }, - { - "cell_type": "markdown", - "id": "f8231ca7-8daf-4208-9739-b77fd06719f9", - "metadata": {}, - "source": [ - "`TransformIterator` provides a way to compose operations by applying a function to each element as it's accessed. The following code is similar to the `CountingIterator` example above, but it wraps the iterator with a `TransformIterator` to compute the sum $1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21ce538d-260c-49a5-83f7-3988aa536803", - "metadata": {}, - "outputs": [], - "source": [ - "# Define the transform operation.\n", - "def square(a):\n", - " return a**2\n", - "\n", - "# prepare the inputs and output.\n", - "it_count = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", - "it_input = comp.TransformIterator(it_count, square) # represents the sequence 1**2, 2**2, 3**2, ...\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", - "\n", - "print(f\"Sum: {d_output.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "a3c05bf4-f477-456b-8238-aa5fc8bbce18", - "metadata": {}, - "source": [ - "You can also wrap an array with a `TransformIterator`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2301f1f-5a56-4075-9c46-c001bc9bab05", - "metadata": {}, - "outputs": [], - "source": [ - "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", - "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, len(d_arr), h_init)\n", - "\n", - "print(f\"Sum: {d_output.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "10c7736a-248b-4512-a262-cf452c46de8e", - "metadata": {}, - "source": [ - "Finally, you can use `TransformOutputIterator` as the output of an algorithm, to apply a function to the result as it's being written.\n", - "\n", - "⚠️ Note that when using `TransformOutputIterator`, you must currently provide explicit type annotations for the transform function." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1142521-452a-4a3a-9f4d-2a1f01251a3d", - "metadata": {}, - "outputs": [], - "source": [ - "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.float32)\n", - "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", - "d_out = cp.empty(1, dtype=np.float32)\n", - "\n", - "# provide type annotations when using `TransformOutputIterator`\n", - "def sqrt(a: np.float32) -> np.float32:\n", - " return a**2\n", - "\n", - "it_output = comp.TransformOutputIterator(d_out, sqrt)\n", - "\n", - "h_init = np.array([0], dtype=np.float32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, it_output, comp.OpKind.PLUS, len(d_arr), h_init) # compute the reduction for `5` input items\n", - "\n", - "print(f\"Sum: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "d98c08e0-629b-4428-96c9-59cc3c132b2d", - "metadata": {}, - "source": [ - "## 8.3 ZipIterator" - ] - }, - { - "cell_type": "markdown", - "id": "8a3d7a82-a627-41dc-923d-242a23eae810", - "metadata": {}, - "source": [ - "A `ZipIterator` combines multiple iterators (or arrays) into a single iterator. 
To access the individual components of any element of a `ZipIterator`, use numeric indexing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "090a5d75-1518-4102-8955-6a8e99e6d617", - "metadata": {}, - "outputs": [], - "source": [ - "d_in1 = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", - "d_in2 = cp.asarray([7, 7, 9, 3, 1, 2, 6, 0], dtype=np.int32)\n", - "it_in3 = comp.CountingIterator(np.int32(0))\n", - "it_input = comp.ZipIterator(d_in1, d_in2, it_in3)\n", - "\n", - "def op(x):\n", - " return x[0] + x[1] + x[2]\n", - "\n", - "d_output = cp.empty_like(d_in1)\n", - "comp.unary_transform(it_input, d_output, op, len(d_in1))\n", - "\n", - "print(f\"Result: {d_output.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "e0cf37c9-5d7e-4b2d-a619-e4bd8dbeab46", - "metadata": {}, - "source": [ - "In the example below, we compute the `min` and `max` of a sequence within a single call to `reduce_into`, using `ZipIterator`. Note the need to define `MinMax` to specify the output type of `minmax_op`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b44215d1-8bad-454b-8a0e-a87895dc1da5", - "metadata": {}, - "outputs": [], - "source": [ - "@comp.gpu_struct\n", - "class MinMax:\n", - " min_value: np.int32\n", - " max_value: np.int32\n", - "\n", - "def minmax_op(x, y):\n", - " return MinMax(min(x[0], y[0]), max(x[1], y[1]))\n", - "\n", - "d_in = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", - "\n", - "it_input = comp.ZipIterator(d_in, d_in)\n", - "d_output = cp.empty(2, dtype=np.int32).view(MinMax.dtype)\n", - "\n", - "SMALLEST_INT = np.iinfo(np.int32).min\n", - "LARGEST_INT = np.iinfo(np.int32).max\n", - "h_init = MinMax(LARGEST_INT, SMALLEST_INT)\n", - "\n", - "comp.reduce_into(it_input, d_output, minmax_op, len(d_in), h_init)\n", - "\n", - "print(f\"Min value: {d_output.get()[0]['min_value']}\")\n", - "print(f\"Max value: {d_output.get()[0]['max_value']}\")" - ] - }, - { - "cell_type": "markdown", - "id": "df2acfef-656b-4c07-9b29-6e8de94add5e", - "metadata": {}, - "source": [ - "### Iterator Composition\n", - "\n", - "You can chain multiple iterator types together to create sophisticated data processing pipelines without intermediate storage." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82a88473", - "metadata": {}, - "outputs": [], - "source": [ - "# Example: Sum of squares of even numbers from 1 to 20\n", - "def square_if_even(x):\n", - " \"\"\"Square the number if it's even, otherwise return 0\"\"\"\n", - " return (x * x) if (x % 2 == 0) else 0\n", - "\n", - "def add_op(a, b):\n", - " return a + b\n", - "\n", - "# Chain operations: generate numbers → filter/square evens → sum\n", - "counting_it = comp.CountingIterator(np.int32(1)) # 1, 2, 3, ..., 20\n", - "transform_it = comp.TransformIterator(counting_it, square_if_even) # 0, 4, 0, 16, 0, 36, ...\n", - "\n", - "d_result = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "comp.reduce_into(transform_it, d_result, add_op, 20, h_init)\n", - "\n", - "# Verify: even numbers 2,4,6,8,10,12,14,16,18,20 -> squares 4,16,36,64,100,144,196,256,324,400\n", - "evens = [x for x in range(1, 21) if x % 2 == 0]\n", - "expected = sum(x * x for x in evens)\n", - "\n", - "print(f\"Numbers 1-20: even squares sum\")\n", - "print(f\"Even numbers: {evens}\")\n", - "print(f\"Their squares: {[x*x for x in evens]}\")\n", - "print(f\"Iterator result: {d_result.get()[0]}\")\n", - "print(f\"Expected result: {expected}\")\n", - "print(f\"Correct: {d_result.get()[0] == expected}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "8c859995", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "# 9. Capstone Exercise: Implementing Running Average\n" - ] - }, - { - "cell_type": "markdown", - "id": "13f16d8f-76e9-40d6-b9ed-159c68379d02", - "metadata": {}, - "source": [ - "In this example, you'll implement the running average of a sequence, using a single call to the [inclusive_scan](https://nvidia.github.io/cccl/unstable/python/compute_api.html#cuda.compute.algorithms.inclusive_scan) API. 
To do this, you'll have to piece together many of the concepts we've learned about so far." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ee1a1e5-ef83-470d-8991-a2808b8b6ab9", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "@comp.gpu_struct\n", - "class SumAndCount:\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "def reduce_op(x, y) -> SumAndCount:\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "def compute_running_average(x: SumAndCount) -> np.float32:\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.float32)\n", - "d_output = cp.empty(len(d_input), dtype=np.float32)\n", - "h_init = SumAndCount(0, 0)\n", - "\n", - "it_input = comp.ZipIterator(d_input, comp.ConstantIterator(np.int32(1)))\n", - "it_output = comp.TransformOutputIterator(d_output, compute_running_average)\n", - "\n", - "# Perform the reduction.\n", - "comp.inclusive_scan(it_input, it_output, reduce_op, h_init, len(d_input))\n", - "\n", - "print(d_input)\n", - "\n", - "h_input = d_input.get()\n", - "expected = h_input.cumsum() / np.arange(1, len(h_input) + 1)\n", - "\n", - "print(f\"Running average result: {d_output}\")\n", - "np.testing.assert_allclose(d_output.get(), expected)" - ] - }, - { - "cell_type": "markdown", - "id": "55a22e1a", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Resources\n", - "\n", - "* `cuda-cccl` Documentation: https://nvidia.github.io/cccl/unstable/python/\n", - "* `parallel` API Reference: https://nvidia.github.io/cccl/unstable/python/compute_api.html#cuda-compute-api-reference" - ] - } - ], - "metadata": { - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - 
"nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "42b6145d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Table of Contents\n", + "\n", + "1. [Introduction: CCCL & Customizing Algorithms](#1.-Introduction:-CCCL-&-Customizing-Algorithms)\n", + " - 1.1 [What is cuda-cccl?](#1.1-What-is-cuda-cccl?)\n", + " - 1.2 [When to use it?](#1.2-When-to-use-it?)\n", + " - 1.3 [Installation](#1.3-Installation)\n", + "2. [Basic Reductions](#2.-Basic-Reductions)\n", + " - 2.1 [Using `reduce_into()` to compute the sum of a sequence](#2.1-Using-`reduce_into()`-to-compute-the-sum-of-a-sequence)\n", + " - 2.2 [Exercise: computing the minimum value](#2.2-Exercise:-computing-the-minimum-value)\n", + "3. [Custom Reductions](#3.-Custom-Reductions)\n", + " - 3.1 [Example: sum of even values](#3.1-Example:-sum-of-even-values)\n", + " - 3.2 [Performance Comparison: Custom vs. Naive CuPy](#3.2-Performance-Comparison:-Custom-vs.-Naive-CuPy)\n", + "4. [Scanning Algorithms](#4.-Scanning-Algorithms)\n", + " - 4.1 [What is a Scan?](#4.1-What-is-a-Scan?)\n", + " - 4.2 [Maximum Scan Example](#4.2-Maximum-Scan-Example)\n", + "5. [Sorting Algorithms](#5.-Sorting-Algorithms)\n", + " - 5.1 [Merge Sort](#5.1-Merge-Sort)\n", + " - 5.1.1 [Exercise: sort by the last digit](#5.1.1-Exercise:-sort-by-the-last-digit)\n", + " - 5.2 [Radix Sort](#5.2-Radix-Sort)\n", + "6. [Transformation Algorithms](#6.-Transformation-Algorithms)\n", + " - 6.1 [Unary and Binary Transform](#6.1-Unary-and-Binary-Transform)\n", + " - 6.2 [Data Normalization with Transform](#6.2-Data-Normalization-with-Transform)\n", + " - 6.3 [Transform with Iterators for Memory Efficiency](#6.3-Transform-with-Iterators-for-Memory-Efficiency)\n", + "7. [Custom (Struct) Data Types](#7.-Custom-(Struct)-Data-Types)\n", + "8. 
[Working with Iterators](#8.-Working-with-Iterators)\n", + " - 8.1 [CountingIterators and ConstantIterator](#8.1-CountingIterators-and-ConstantIterator)\n", + " - 8.2 [TransformIterator](#8.2-TransformIterator)\n", + " - 8.3 [ZipIterator](#8.3-ZipIterator)\n", + "9. [Capstone Exercise: Implementing Running Average](#9.-Capstone-Exercise:-Implementing-Running-Average)" + ] + }, + { + "cell_type": "markdown", + "id": "575a8a12", + "metadata": {}, + "source": [ + "# 1. Introduction: CCCL & Customizing Algorithms\n" + ] + }, + { + "cell_type": "markdown", + "id": "008c05bf-5615-4b0b-8294-d66795f1f155", + "metadata": {}, + "source": [ + "## 1.1 What is cuda-cccl?" + ] + }, + { + "cell_type": "markdown", + "id": "98385a37-45b5-4b42-8a30-22112d2292df", + "metadata": {}, + "source": [ + "The [CUDA Core Compute Libraries (CCCL)](https://nvidia.github.io/cccl/unstable/python/) provide high-quality, high-performance abstractions for CUDA development in Python. The `cuda-cccl` Python package is composed of two independent subpackages:\n", + "\n", + "* `cuda.compute` is a **parallel algorithms library** containing algorithms like `reduce`, `transform`, `scan` and `sort`. These can be combined to implement more complex algorithms, while delivering the performance of hand-optimized CUDA kernels, portable across different GPU architectures. They are general-purpose and **designed to be used with CuPy, PyTorch and other array/tensor frameworks.**\n", + "\n", + "* `cuda.coop` is a lower-level library containing **cooperative algorithms meant to be used within (numba) CUDA kernels**. Examples include _block-wide reduction_ and _warp-wide scan_, providing numba CUDA kernel developers with building blocks to create speed-of-light, custom kernels." + ] + }, + { + "cell_type": "markdown", + "id": "79acd5e0-640c-4b61-8528-f57904d0ca95", + "metadata": {}, + "source": [ + "## 1.2 When to use it?"
+ ] + }, + { + "cell_type": "markdown", + "id": "14f05b64-f709-41de-8899-add8b4c3dcb3", + "metadata": {}, + "source": [ + "`cccl` provides a level of abstraction in between tensor libraries and raw CUDA kernels.\n", + "\n", + "- If you want to implement custom functionality that can not easily and efficiently be expressed using PyTorch/CuPy operations, you can reach for `cuda.compute` before resorting to writing CUDA kernels.\n", + "- If you _do_ need to write a kernel, you can often make use of the block-level and warp-level primitives offered by `cuda.coop` to write your kernel much more efficiently and concisely. " + ] + }, + { + "cell_type": "markdown", + "id": "3e2b1188-ce5e-4a10-84a1-ed6ace92922f", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "71178c05", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 1.3 Installation" + ] + }, + { + "cell_type": "markdown", + "id": "fc78704b-cc16-4f3a-8a76-a1ecb4acd8e3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The command below installs `cuda-cccl` along with pieces of the CUDA toolkit it needs. You'll only need to do this in Google Colab." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23c21a66-dcf9-44f5-8262-78aa77754523", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\") and not os.path.exists(\"/ach-installed\"): # If running in Google Colab:\n", + " !pip uninstall \"cuda-python\" --yes > /dev/null\n", + " !pip install \"numba-cuda\" \"cuda-cccl[test-cu12]\" > /dev/null 2>&1\n", + " open(\"/ach-installed\", \"a\").close()" + ] + }, + { + "cell_type": "markdown", + "id": "10355920-9bfc-4788-bfe7-ab99440a6d98", + "metadata": {}, + "source": [ + "The `[test-cu12]` extras installs CuPy, which we will use in our examples. It is not strictly a dependency of `cuda-cccl` - you can use any array-like object (like PyTorch tensors) as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bca5a026-db41-4079-808e-b8c3c7196c7a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "import cuda.compute as comp" + ] + }, + { + "cell_type": "markdown", + "id": "01908d54", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 2. Basic Reductions" + ] + }, + { + "cell_type": "markdown", + "id": "2d468957-1411-4cae-a050-a75692452cc2", + "metadata": {}, + "source": [ + "A **reduction** takes many values and combines them into a single result using a binary operation.\n", + "\n", + "As a simple example, consider a sequence of values like $[2, 3, 5, 1, 7, 6, 8, 4]$. The *sum* of the values of that sequence is a reduction using _addition_ as the binary operation: $(2 + 3 + 5 + 1 + 7 + 6 + 8 + 4) = 36$. Similarly, the *maximum value* can be obtained by performing a reduction using `max(a, b)` as the binary operation." 
+ ] + }, + { + "cell_type": "markdown", + "id": "83065c06-aeab-4d26-9d18-69edb8462c2d", + "metadata": {}, + "source": [ + "A reduction can be computed in parallel. Typically this is done using a \"tree\" reduction where elements are combined in pairs across multiple levels, resembling the structure of a binary tree. At each level, the number of elements is halved as partial results are computed in parallel. This continues until a single final result is obtained at the root of the tree.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "01d0feb1-7ed0-42dc-bb83-8b017d89a2a6", + "metadata": {}, + "source": [ + "If you know some CUDA, you can quite easily write a kernel to implement this kind of parallel reduction. However, optimizing it for the specific CUDA architecture of your device, and generalizing for different data types and sizes can be difficult.\n", + "\n", + "This is where `cuda.compute` comes in. It provides optimized implementations of algorithms like reduction that give the best possible performance." 
+ ] + }, + { + "cell_type": "markdown", + "id": "23cb8fb8-8e35-4179-8a66-2c3b5e6077ae", + "metadata": {}, + "source": [ + "## 2.1 Using `reduce_into()` to compute the sum of a sequence" + ] + }, + { + "cell_type": "markdown", + "id": "9c8f6367-d7f9-4030-bc57-4b8920299b47", + "metadata": {}, + "source": [ + "`cuda.compute` provides a `reduce_into()` function to compute general reductions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adce5791", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "Using `reduce_into()` to compute the sum of a sequence\n", + "\"\"\"\n", + "\n", + "# Prepare the inputs and outputs.\n", + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32) # input sequence, a CuPy (device) array\n", + "d_output = cp.empty(1, dtype=np.int32) # array which will hold the result, a CuPy (device) array of size 1\n", + "h_init = np.array([0], dtype=np.int32) # initial value of the reduction, a NumPy (host) array of size 1\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(d_input, d_output, comp.OpKind.PLUS, len(d_input), h_init)\n", + "\n", + "print(d_input)\n", + "# Verify the result.\n", + "expected_output = 36\n", + "assert (d_output == expected_output).all()\n", + "result = d_output[0]\n", + "print(f\"Sum reduction result: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2f00121b-bfb1-4b37-9651-230386d9c256", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 2.2 Exercise: computing the minimum value" + ] + }, + { + "cell_type": "markdown", + "id": "8f5e9ffc-89cd-4859-97ea-93abbb8b3f4b", + "metadata": {}, + "source": [ + "`reduce_into()` can be used to compute other reductions " + ] + }, + { + "cell_type": "markdown", + "id": "9241706a-b152-41b3-bba7-d281c9e43675", + "metadata": {}, + "source": [ + "Similar to the examples above, below is an 
incomplete code snippet for computing the minimum value of a sequence. Complete the section between the comments `begin TODO` and `end TODO` to use `reduce_into()` to compute the minimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44cb3daf-1a82-4af3-965e-b0d4be56b17e", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Using `reduce_into()` to compute the minimum value of a sequence\n", + "\"\"\"\n", + "\n", + "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "\n", + "# begin TODO\n", + "\n", + "\n", + "# end TODO\n", + "\n", + "expected_output = -6\n", + "assert (d_output == expected_output).all()\n", + "result = d_output[0]\n", + "print(f\"Min reduction result: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9839aacd-7256-484c-8205-e48068f3217b", + "metadata": {}, + "source": [ + "# 3. Custom Reductions" + ] + }, + { + "cell_type": "markdown", + "id": "9b74da96-38ff-434a-87fa-bba49e37bf5a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 3.1 Example: sum of even values" + ] + }, + { + "cell_type": "markdown", + "id": "a0395884-0786-4b75-9442-046609041439", + "metadata": {}, + "source": [ + "At this point, you might be thinking:" + ] + }, + { + "cell_type": "markdown", + "id": "f09ef69c-07d1-4ca7-af22-89f105d7c532", + "metadata": {}, + "source": [ + "> **_Umm, can't I just use CuPy or PyTorch to compute sum or max?_**" + ] + }, + { + "cell_type": "markdown", + "id": "cd42b854-3576-4cf8-9880-f1aed514a10b", + "metadata": {}, + "source": [ + "Of course, given a CuPy array, it's trivial to do simple reductions like `sum`, `min` or `max`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61248d58-dcb7-4bf2-9b8b-9669ea2cd3d6", + "metadata": {}, + "outputs": [], + "source": [ + "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", + "\n", + 
"print(f\"Sum using cp.sum: {cp.sum(d_input)}\")\n", + "print(f\"Max value using cp.max: {cp.max(d_input)}\")\n", + "print(f\"Min value using cp.min: {cp.min(d_input)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b96e0729-87d3-4423-ac13-28c5d34e3786", + "metadata": {}, + "source": [ + "The benefit of `cuda-cccl` is more apparent when you want to do custom operations. For example, rather than just computing a straightforward `sum`, let's say we wanted to compute the sum of **only even values** in a sequence. Naively, here's how to do that with CuPy:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "929b31ba-ce77-4fcd-9750-5efa0c13c346", + "metadata": {}, + "outputs": [], + "source": [ + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", + "result = (d_input[d_input % 2 == 0]).sum()\n", + "print(f\"Sum of even values with CuPy: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "48fd15a3-f616-4337-a7d7-c81c999a73f7", + "metadata": {}, + "source": [ + "Now, let's do the same thing with `parallel`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03947ed8-c6f9-49ae-b382-db7a0ef14931", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Using `reduce_into()` with a custom binary operation\n", + "\"\"\"\n", + "\n", + "# Define a custom binary operation for the reduction.\n", + "def sum_even_op(a, b):\n", + " return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0)\n", + "\n", + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Call `reduce_into()` passing the function above for the binary operation:\n", + "comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", + "result = d_output.get()[0]\n", + "print(f\"Sum of even values with `cuda.compute`: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"e7c9205d-7668-4a72-a01d-f57e2d1a3bb1", + "metadata": {}, + "source": [ + "We got the same result using `cuda.compute`, but we had to write significantly more code. Is it worth it? Below is a small benchmarking script comparing timings for a range of input sizes:" + ] + }, + { + "cell_type": "markdown", + "id": "43afe1e9-c870-43e0-b884-2c7f93581869", + "metadata": {}, + "source": [ + "## 3.2 Performance Comparison: Custom vs. Naive CuPy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "729a1867-3a21-4f09-af97-f02d0a562d9f", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Compare the performance of the `parallel` implementation with a naive CuPy implementation\n", + "\"\"\"\n", + "\n", + "import timeit\n", + "\n", + "def evens_sum_cupy(d_input, d_output, h_init):\n", + " # ignore h_init\n", + " cp.sum(d_input[d_input % 2 == 0], out=d_output[0])\n", + "\n", + "def evens_sum_cccl(d_input, d_output, h_init):\n", + " # note, using `op` as the binary operation, rather than `OpKind.PLUS`:\n", + " comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", + "\n", + "def time_gpu_func(f, *args, **kwargs):\n", + " cp.cuda.Device().synchronize()\n", + " t1 = timeit.default_timer()\n", + " n = 1_000\n", + " for i in range(n):\n", + " f(*args, **kwargs)\n", + " cp.cuda.Device().synchronize()\n", + " t2 = timeit.default_timer()\n", + " return t2 - t1\n", + "\n", + "sizes = [10_000, 100_000, 1_000_000, 10_000_000, 100_000_000]\n", + "cccl_times = []\n", + "cp_times = []\n", + "\n", + "for n in sizes:\n", + " d_input = cp.random.randint(low=0, high=10, size=n, dtype=np.int32)\n", + " d_out = cp.empty(1, dtype=np.int32)\n", + " h_init = np.array([0], dtype=np.int32)\n", + "\n", + " cccl_times.append(time_gpu_func(evens_sum_cccl, d_input, d_out, h_init))\n", + " cp_times.append(time_gpu_func(evens_sum_cupy, d_input, d_out, h_init))\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Plotting\n", + "fig = 
plt.figure(figsize=(10, 5))\n", + "plt.loglog(sizes, cccl_times, marker='o', label='cuda.ccl')\n", + "plt.loglog(sizes, cp_times, marker='s', label='CuPy')\n", + "\n", + "# Annotate each cuda.ccl point with speedup vs CuPy\n", + "for x, t_cccl, t_cp in zip(sizes, cccl_times, cp_times):\n", + " speedup = t_cp / t_cccl\n", + " label = f\"{speedup:.1f}x faster\"\n", + " plt.annotate(label,\n", + " (x, t_cccl),\n", + " textcoords=\"offset points\",\n", + " xytext=(5, -10), # offset position\n", + " ha='left',\n", + " fontsize=9,\n", + " color='green')\n", + "\n", + "# Labels and title\n", + "plt.xlabel('Input Size')\n", + "plt.ylabel('Time (seconds)')\n", + "plt.title('Timing Comparison for evens_sum.')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.tight_layout()\n" + ] + }, + { + "cell_type": "markdown", + "id": "a0a0d085-75ab-4836-afb6-72ec1abd1d6a", + "metadata": {}, + "source": [ + "We see that using `cuda.compute` is much faster than our naive CuPy approach. This is because:\n", + "\n", + "* Operator fusion: the CuPy operation `x[x % 2 == 0]).sum()` is actually 4 separate operations (and at least 4 separate CUDA kernel invocations). With `cuda.compute`, we have a single call to `reduce_into()` that does all the computation.\n", + "* No intermediate memory allocations.\n", + "* Lesser Python overhead: `cuda.compute` is a lower-level library. You don't have to jump through multiple layers of Python before invoking device code." + ] + }, + { + "cell_type": "markdown", + "id": "a987e11b-777b-4da3-81f8-5808c0dd8836", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 4. Scanning Algorithms\n", + "\n", + "## 4.1 What is a Scan?\n", + "\n", + "A **scan** (also called prefix sum) computes a running total of elements. 
For each position, it shows the cumulative result up to that point.\n", + "\n", + "**Two types of scans:**\n", + "* **Inclusive scan**: Includes the current element in the sum\n", + "* **Exclusive scan**: Excludes the current element (shifts results)\n", + "\n", + "**Visual example:**\n", + "\n", + "```\n", + "Input: [3, 1, 4, 1, 5]\n", + "Inclusive: [3, 4, 8, 9, 14] (3, 3+1, 3+1+4, 3+1+4+1, 3+1+4+1+5)\n", + "Exclusive: [0, 3, 4, 8, 9] (0, 3, 3+1, 3+1+4, 3+1+4+1)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fca0df7f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)\n", + "d_inclusive = cp.empty_like(d_input)\n", + "d_exclusive = cp.empty_like(d_input)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "def add_op(a, b):\n", + " return a + b\n", + "\n", + "comp.inclusive_scan(d_input, d_inclusive, add_op, h_init, len(d_input))\n", + "comp.exclusive_scan(d_input, d_exclusive, add_op, h_init, len(d_input))\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "print(f\"Inclusive scan: {d_inclusive.get()}\")\n", + "print(f\"Exclusive scan: {d_exclusive.get()}\")\n", + "\n", + "# Verify with NumPy\n", + "np_inclusive = np.cumsum(d_input.get())\n", + "np_exclusive = np.concatenate([[0], np_inclusive[:-1]])\n", + "np.testing.assert_allclose(d_inclusive.get(), np_inclusive)\n", + "np.testing.assert_allclose(d_exclusive.get(), np_exclusive)\n", + "print(f\"NumPy inclusive: {np_inclusive}\")\n", + "print(f\"NumPy exclusive: {np_exclusive}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d5c4b010", + "metadata": {}, + "source": [ + "## 4.2 Maximum Scan Example\n", + "\n", + "Scans aren't limited to addition. 
Here's an example using maximum operation to find running maximum.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "088f30c8", + "metadata": {}, + "outputs": [], + "source": [ + "# Running maximum example\n", + "d_input = cp.array([3, 7, 2, 9, 1, 8, 4, 6], dtype=np.int32)\n", + "d_output = cp.empty_like(d_input)\n", + "\n", + "def max_op(a, b):\n", + " return a if a > b else b\n", + "\n", + "# Start with a very small value\n", + "h_init = np.array([-999999], dtype=np.int32)\n", + "\n", + "# Perform inclusive scan with max operation\n", + "comp.inclusive_scan(d_input, d_output, max_op, h_init, len(d_input))\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "print(f\"Running max: {d_output.get()}\")\n", + "\n", + "# Verify with NumPy\n", + "np_running_max = np.maximum.accumulate(d_input.get())\n", + "print(f\"NumPy max: {np_running_max}\")\n", + "print(f\"Match: {np.array_equal(d_output.get(), np_running_max)}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "5e37c3f0-b7cb-45e6-83bf-2bbbeeab74b2", + "metadata": {}, + "source": [ + "# 5. Sorting Algorithms" + ] + }, + { + "cell_type": "markdown", + "id": "7aaa94ff", + "metadata": {}, + "source": [ + "## 5.1 Merge Sort" + ] + }, + { + "cell_type": "markdown", + "id": "e5c4043e-6dec-44f4-ab6b-15f24d0c3cfb", + "metadata": {}, + "source": [ + "The `merge_sort` function can be used to perform key-value sorting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a893a898-e74b-4ea2-b69a-4432a342fead", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input arrays.\n", + "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", + "d_in_values = cp.asarray(\n", + " [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype=\"float32\"\n", + ")\n", + "\n", + "# Perform the merge sort.\n", + "comp.merge_sort(\n", + " d_in_keys,\n", + " d_in_values,\n", + " d_in_keys, # reuse input array to store output\n", + " d_in_values, # reuse input array to store output\n", + " comp.OpKind.LESS,\n", + " d_in_keys.size,\n", + ")\n", + "\n", + "print(f\"Sorted keys: {d_in_keys.get()}\")\n", + "print(f\"Sorted values: {d_in_values.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "13dafff6-7a05-43d4-bedb-1c6cc0027573", + "metadata": {}, + "source": [ + "If you just want to sort keys (with no corresponding values), just pass `None`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e90bc0c-810e-46d9-9ef9-9f3adaeecd10", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", + "\n", + "print(d_in_keys)\n", + "\n", + "# Perform the merge sort.\n", + "comp.merge_sort(\n", + " d_in_keys,\n", + " None, # don't specify a values array\n", + " d_in_keys, # reuse input array to store output\n", + " None, # don't specify a values array\n", + " comp.OpKind.LESS,\n", + " d_in_keys.size,\n", + ")\n", + "\n", + "print(f\"Sorted keys: {d_in_keys.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "6c6ffd71-90d1-461f-843b-96aca2990206", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### 5.1.1 Exercise: sort by the last digit" + ] + }, + { + "cell_type": "markdown", + "id": "3b91fe2a-169e-4ee6-8408-ee87cf49b481", + 
"metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "In this excercise, you'll use `merge_sort` with a custom comparator function to sort elements by the last digit.\n", + "For example, $[29, 9, 136, 1001, 72, 24, 32, 1] \\rightarrow [1001, 1, 72, 32, 24, 136, 29, 9]$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88e95ff4-013a-4ca9-bdf4-5e122ba0f66a", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in_keys = cp.asarray([29, 9, 136, 1001, 72, 24, 32, 1], dtype=\"int32\")\n", + "\n", + "# define the custom comparator.\n", + "def comparison_op(lhs, rhs):\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "# Perform the merge sort.\n", + "comp.merge_sort(\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + ")\n", + "\n", + "print(f\"Result: {d_in_keys}\")\n", + "expected = np.asarray([1001, 1, 72, 32, 24, 136, 29, 9], dtype=np.int32)\n", + "assert (d_in_keys.get() == expected).all()" + ] + }, + { + "cell_type": "markdown", + "id": "b0b45660-9d80-43a9-9b7b-22eaeed7df4f", + "metadata": {}, + "source": [ + "## 5.2 Radix Sort\n", + "\n", + "The `radix_sort` function provides fast sorting for numeric types using the radix sort algorithm. Unlike merge sort, radix sort doesn't use comparisons but instead processes the bits/digits of numbers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f61f655", + "metadata": {}, + "outputs": [], + "source": [ + "# Basic radix sort example (ascending order)\n", + "d_input = cp.array([64, 34, 25, 12, 22, 11, 90, 5, 77, 30], dtype=np.int32)\n", + "d_output = cp.empty_like(d_input)\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "\n", + "# Sort in ascending order\n", + "comp.radix_sort(\n", + " d_input, # Input keys\n", + " d_output, # Output keys\n", + " None, # Input values (none for keys-only sort)\n", + " None, # Output values (none)\n", + " comp.SortOrder.ASCENDING, # Sort order\n", + " len(d_input) # Number of elements\n", + ")\n", + "\n", + "print(f\"Sorted: {d_output.get()}\")\n", + "\n", + "# Verify sorting\n", + "is_sorted = all(d_output.get()[i] <= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", + "print(f\"Properly sorted: {is_sorted}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdaa70a2", + "metadata": {}, + "outputs": [], + "source": [ + "# Descending order sort\n", + "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 3], dtype=np.int32)\n", + "d_output = cp.empty_like(d_input)\n", + "\n", + "comp.radix_sort(\n", + " d_input, d_output, None, None,\n", + " comp.SortOrder.DESCENDING, # Sort in reverse order\n", + " len(d_input)\n", + ")\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "print(f\"Descending sort: {d_output.get()}\")\n", + "\n", + "# Verify descending order\n", + "is_descending = all(d_output.get()[i] >= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", + "print(f\"Properly descending: {is_descending}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17ce2740", + "metadata": {}, + "outputs": [], + "source": [ + "# Key-value sorting: sort scores while keeping student IDs aligned\n", + "scores = [85, 92, 78, 96, 88, 71, 94]\n", + "student_ids = [101, 102, 103, 104, 105, 106, 107]\n", + "\n", + "d_keys = cp.array(scores, 
dtype=np.int32)\n", + "d_values = cp.array(student_ids, dtype=np.int32)\n", + "d_keys_out = cp.empty_like(d_keys)\n", + "d_values_out = cp.empty_like(d_values)\n", + "\n", + "print(\"Before sorting:\")\n", + "for score, student_id in zip(scores, student_ids):\n", + " print(f\" Student {student_id}: {score}\")\n", + "\n", + "# Sort by scores (highest first), keep student IDs aligned\n", + "comp.radix_sort(\n", + " d_keys, d_keys_out, # Input/output keys (scores)\n", + " d_values, d_values_out, # Input/output values (student IDs)\n", + " comp.SortOrder.DESCENDING, # Highest scores first\n", + " len(d_keys)\n", + ")\n", + "\n", + "sorted_scores = d_keys_out.get()\n", + "sorted_ids = d_values_out.get()\n", + "\n", + "print(\"\\nAfter sorting (by score, highest first):\")\n", + "for score, student_id in zip(sorted_scores, sorted_ids):\n", + " print(f\" Student {student_id}: {score}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "423fd14a", + "metadata": {}, + "source": [ + "# 6. Transformation Algorithms\n" + ] + }, + { + "cell_type": "markdown", + "id": "5c001e63-54ab-4943-83be-7503fea3ae0a", + "metadata": {}, + "source": [ + "## 6.1 Unary and Binary Transform\n", + "\n", + "### Unary transform" + ] + }, + { + "cell_type": "markdown", + "id": "b55614c6-6865-4bb8-941d-ae73290c7a91", + "metadata": {}, + "source": [ + "The `unary_transform` function applies a user-provided unary operation to each element of the input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b45d79d-87a3-47c2-9681-8810535d3a80", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)\n", + "d_out = cp.empty_like(d_in)\n", + "\n", + "def double_op(a):\n", + " return a * 2\n", + "\n", + "# Perform the unary transform.\n", + "comp.unary_transform(d_in, d_out, double_op, len(d_in))\n", + "print(f\"Result of unary transform: {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4e413c39-444b-43f8-b5e5-bd1adc18417c", + "metadata": {}, + "source": [ + "### Binary transform" + ] + }, + { + "cell_type": "markdown", + "id": "fa210a46-0c41-4c3f-94bd-954bc6add058", + "metadata": {}, + "source": [ + "The `binary_transform` function applies a user-provided binary operation to pairs of elements from two inputs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "346192c1-2318-437f-aaf5-c8e5f3b21d1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", + "d_in2 = cp.asarray([7, 2, 1, 0, -1], dtype=np.int32)\n", + "d_out = cp.empty_like(d_in1)\n", + "\n", + "# Perform the binary transform.\n", + "comp.binary_transform(d_in1, d_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", + "print(f\"Result of binary transform: {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cd76102c-c085-4893-b047-5c22eeabb20b", + "metadata": {}, + "source": [ + "## 6.2 Data Normalization with Transform\n", + "\n", + "Transform operations are commonly used in machine learning for data preprocessing, such as normalizing features to have zero mean and unit variance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fd56eac", + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Normalize house prices for machine learning\n", + "house_prices = np.array([250000, 180000, 320000, 420000, 150000, 380000, 220000, 295000], dtype=np.float32)\n", + "d_prices = cp.array(house_prices)\n", + "d_normalized = cp.empty_like(d_prices)\n", + "\n", + "# Calculate statistics for normalization\n", + "price_mean = float(np.mean(house_prices))\n", + "price_std = float(np.std(house_prices))\n", + "\n", + "print(f\"Original prices: {house_prices}\")\n", + "print(f\"Mean: ${price_mean:,.0f}, Std: ${price_std:,.0f}\")\n", + "\n", + "def z_score_normalize(price):\n", + " \"\"\"Z-score normalization: (x - mean) / std\"\"\"\n", + " return (price - price_mean) / price_std\n", + "\n", + "# Apply normalization transformation\n", + "comp.unary_transform(d_prices, d_normalized, z_score_normalize, len(house_prices))\n", + "\n", + "normalized_result = d_normalized.get()\n", + "print(f\"Normalized prices: {normalized_result}\")\n", + "print(f\"Normalized mean: {np.mean(normalized_result):.6f}\")\n", + "print(f\"Normalized std: {np.std(normalized_result):.6f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "17282ee7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 7. Custom (Struct) Data Types\n" + ] + }, + { + "cell_type": "markdown", + "id": "541308b1-65b3-4856-adb9-aa1f039f0396", + "metadata": {}, + "source": [ + "So far, we've seen how to use `parallel` with input arrays composed of numeric values (ints and floats). A powerful feature of `parallel` is that it can also work with \"struct\" values, i.e., values that are in turn composed of more than one value. 
" + ] + }, + { + "cell_type": "markdown", + "id": "df4ed2b7-a379-4ff4-941d-f0da11ac0a7a", + "metadata": {}, + "source": [ + "For example, consider a sequence of RGB values, like those used in graphics applications. Each RGB value represents a pixel's color and consists of three components: **red**, **green**, and **blue** intensity levels." + ] + }, + { + "cell_type": "markdown", + "id": "6fa5625f-a13f-468a-baa1-84b7e5abd427", + "metadata": {}, + "source": [ + "The code below shows how you can use `parallel` to find the pixel with the highest **green** intensity level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9281737-5422-41a3-8052-7ade54a96de0", + "metadata": {}, + "outputs": [], + "source": [ + "# use `@gpu_struct` to define the data type of each value:\n", + "@comp.gpu_struct\n", + "class Pixel:\n", + " r: np.int32\n", + " g: np.int32\n", + " b: np.int32\n", + "\n", + "# Define a reduction operation that operates on two `Pixel` objects:\n", + "def max_g_value(x, y):\n", + " return x if x.g > y.g else y\n", + "\n", + "# Prepare the input and output arrays. These are just CuPy arrays:\n", + "dtype = np.dtype([(\"r\", np.int32), (\"g\", np.int32), (\"b\", np.int32)], align=True) # alternately, use `Pixel.dtype`\n", + "d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(dtype)\n", + "d_out = cp.empty(1, dtype)\n", + "\n", + "# Define the initial value for the reduction. This must be a `Pixel` object:\n", + "h_init = Pixel(0, 0, 0)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(d_rgb, d_out, max_g_value, d_rgb.size, h_init)\n", + "\n", + "# Verify the result.\n", + "print(f\"Input RGB values: \\n {d_rgb.get()}\")\n", + "result = d_out.get()\n", + "print(f\"Pixel with greatest 'g' intensity: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2567bc4d", + "metadata": {}, + "source": [ + "# 8. 
Working with Iterators\n", + "\n", + "Now you have a taste for how to use `parallel` with **custom ops** and **custom data types**. _Iterators_ are another powerful tool in your toolbox for solving more complex problems.\n", + "\n", + "Iterators represent streams of data that are computed \"on-the-fly\". Unlike arrays, iterators do not require any memory allocation, and thus can represent huge sequences without consuming valuable GPU memory. Iterators can be used as inputs (and sometimes outputs) to algorithms in place of arrays.\n", + "\n", + "Note that \"iterators\" in the context of the `parallel` library is distinct from the concept of [iterators](https://docs.python.org/3/glossary.html#term-iterator) in the Python language. " + ] + }, + { + "cell_type": "markdown", + "id": "31f350bc-3825-49c3-a858-647aa34177aa", + "metadata": {}, + "source": [ + "## 8.1 CountingIterators and ConstantIterator" + ] + }, + { + "cell_type": "markdown", + "id": "514e4183-fc23-4355-8ad4-7b9989c9eaf4", + "metadata": {}, + "source": [ + "A `CountingIterator` represents the sequence `a, a + 1, a + 2, a + 3,.... `. In the following example, we use a `CountingIterator` as the input to `reduce_into` to compute the sum $1 + 2 + 3 + 4 + 5 = 15$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "604e5f6b", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the inputs and outputs:\n", + "it_input = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", + "\n", + "print(f\"Sum: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "dec7e377", + "metadata": {}, + "source": [ + "A `ConstantIterator` represents the sequence `a, a, a, ...`. 
In the following example, we use a `ConstantIterator` as one of the inputs to `binary_transform`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcff9cb6-ac1d-43e6-b439-ba7c89029b8c", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", + "it_in2 = comp.ConstantIterator(np.int32(1))\n", + "d_out = cp.empty_like(d_in1)\n", + "\n", + "# Perform the binary transform.\n", + "comp.binary_transform(d_in1, it_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", + "print(f\"Result of binary transform: {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "84e88c3f-8325-47f5-a759-3355b73083f3", + "metadata": {}, + "source": [ + "## 8.2 TransformIterator" + ] + }, + { + "cell_type": "markdown", + "id": "f8231ca7-8daf-4208-9739-b77fd06719f9", + "metadata": {}, + "source": [ + "`TransformIterator` provides a way to compose operations by applying a function to each element as it's accessed. The following code is similar to the `CountingIterator` example above, but it wraps the iterator with a `TransformIterator` to compute the sum $1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21ce538d-260c-49a5-83f7-3988aa536803", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the transform operation.\n", + "def square(a):\n", + " return a**2\n", + "\n", + "# prepare the inputs and output.\n", + "it_count = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", + "it_input = comp.TransformIterator(it_count, square) # represents the sequence 1**2, 2**2, 3**2, ...\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", + "\n", + "print(f\"Sum: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a3c05bf4-f477-456b-8238-aa5fc8bbce18", + "metadata": {}, + "source": [ + "You can also wrap an array with a `TransformIterator`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2301f1f-5a56-4075-9c46-c001bc9bab05", + "metadata": {}, + "outputs": [], + "source": [ + "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", + "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, len(d_arr), h_init)\n", + "\n", + "print(f\"Sum: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "10c7736a-248b-4512-a262-cf452c46de8e", + "metadata": {}, + "source": [ + "Finally, you can use `TransformOutputIterator` as the output of an algorithm, to apply a function to the result as it's being written.\n", + "\n", + "⚠️ Note that when using `TransformOutputIterator`, you must currently provide explicit type annotations for the transform function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1142521-452a-4a3a-9f4d-2a1f01251a3d", + "metadata": {}, + "outputs": [], + "source": [ + "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.float32)\n", + "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", + "d_out = cp.empty(1, dtype=np.float32)\n", + "\n", + "# provide type annotations when using `TransformOutputIterator`\n", + "def sqrt(a: np.float32) -> np.float32:\n", + " return a**0.5\n", + "\n", + "it_output = comp.TransformOutputIterator(d_out, sqrt)\n", + "\n", + "h_init = np.array([0], dtype=np.float32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, it_output, comp.OpKind.PLUS, len(d_arr), h_init) # compute the reduction for `len(d_arr)` input items\n", + "\n", + "print(f\"Sum: {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d98c08e0-629b-4428-96c9-59cc3c132b2d", + "metadata": {}, + "source": [ + "## 8.3 ZipIterator" + ] + }, + { + "cell_type": "markdown", + "id": "8a3d7a82-a627-41dc-923d-242a23eae810", + "metadata": {}, + "source": [ + "A `ZipIterator` combines multiple iterators (or arrays) into a single iterator.
To access the individual components of any element of a `ZipIterator`, use numeric indexing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "090a5d75-1518-4102-8955-6a8e99e6d617", + "metadata": {}, + "outputs": [], + "source": [ + "d_in1 = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", + "d_in2 = cp.asarray([7, 7, 9, 3, 1, 2, 6, 0], dtype=np.int32)\n", + "it_in3 = comp.CountingIterator(np.int32(0))\n", + "it_input = comp.ZipIterator(d_in1, d_in2, it_in3)\n", + "\n", + "def op(x):\n", + " return x[0] + x[1] + x[2]\n", + "\n", + "d_output = cp.empty_like(d_in1)\n", + "comp.unary_transform(it_input, d_output, op, len(d_in1))\n", + "\n", + "print(f\"Result: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e0cf37c9-5d7e-4b2d-a619-e4bd8dbeab46", + "metadata": {}, + "source": [ + "In the example below, we compute the `min` and `max` of a sequence within a single call to `reduce_into`, using `ZipIterator`. Note the need to define `MinMax` to specify the output type of `minmax_op`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b44215d1-8bad-454b-8a0e-a87895dc1da5", + "metadata": {}, + "outputs": [], + "source": [ + "@comp.gpu_struct\n", + "class MinMax:\n", + " min_value: np.int32\n", + " max_value: np.int32\n", + "\n", + "def minmax_op(x, y):\n", + " return MinMax(min(x[0], y[0]), max(x[1], y[1]))\n", + "\n", + "d_in = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", + "\n", + "it_input = comp.ZipIterator(d_in, d_in)\n", + "d_output = cp.empty(2, dtype=np.int32).view(MinMax.dtype)\n", + "\n", + "SMALLEST_INT = np.iinfo(np.int32).min\n", + "LARGEST_INT = np.iinfo(np.int32).max\n", + "h_init = MinMax(LARGEST_INT, SMALLEST_INT)\n", + "\n", + "comp.reduce_into(it_input, d_output, minmax_op, len(d_in), h_init)\n", + "\n", + "print(f\"Min value: {d_output.get()[0]['min_value']}\")\n", + "print(f\"Max value: {d_output.get()[0]['max_value']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "df2acfef-656b-4c07-9b29-6e8de94add5e", + "metadata": {}, + "source": [ + "### Iterator Composition\n", + "\n", + "You can chain multiple iterator types together to create sophisticated data processing pipelines without intermediate storage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82a88473", + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Sum of squares of even numbers from 1 to 20\n", + "def square_if_even(x):\n", + " \"\"\"Square the number if it's even, otherwise return 0\"\"\"\n", + " return (x * x) if (x % 2 == 0) else 0\n", + "\n", + "def add_op(a, b):\n", + " return a + b\n", + "\n", + "# Chain operations: generate numbers → filter/square evens → sum\n", + "counting_it = comp.CountingIterator(np.int32(1)) # 1, 2, 3, ..., 20\n", + "transform_it = comp.TransformIterator(counting_it, square_if_even) # 0, 4, 0, 16, 0, 36, ...\n", + "\n", + "d_result = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "comp.reduce_into(transform_it, d_result, add_op, 20, h_init)\n", + "\n", + "# Verify: even numbers 2,4,6,8,10,12,14,16,18,20 -> squares 4,16,36,64,100,144,196,256,324,400\n", + "evens = [x for x in range(1, 21) if x % 2 == 0]\n", + "expected = sum(x * x for x in evens)\n", + "\n", + "print(f\"Numbers 1-20: even squares sum\")\n", + "print(f\"Even numbers: {evens}\")\n", + "print(f\"Their squares: {[x*x for x in evens]}\")\n", + "print(f\"Iterator result: {d_result.get()[0]}\")\n", + "print(f\"Expected result: {expected}\")\n", + "print(f\"Correct: {d_result.get()[0] == expected}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "8c859995", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 9. Capstone Exercise: Implementing Running Average\n" + ] + }, + { + "cell_type": "markdown", + "id": "13f16d8f-76e9-40d6-b9ed-159c68379d02", + "metadata": {}, + "source": [ + "In this example, you'll implement the running average of a sequence, using a single call to the [inclusive_scan](https://nvidia.github.io/cccl/unstable/python/compute_api.html#cuda.compute.algorithms.inclusive_scan) API. 
To do this, you'll have to piece together many of the concepts we've learned about so far." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ee1a1e5-ef83-470d-8991-a2808b8b6ab9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "@comp.gpu_struct\n", + "class SumAndCount:\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "def reduce_op(x, y) -> SumAndCount:\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "def compute_running_average(x: SumAndCount) -> np.float32:\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.float32)\n", + "d_output = cp.empty(len(d_input), dtype=np.float32)\n", + "h_init = SumAndCount(0, 0)\n", + "\n", + "it_input = comp.ZipIterator(d_input, comp.ConstantIterator(np.int32(1)))\n", + "it_output = comp.TransformOutputIterator(d_output, compute_running_average)\n", + "\n", + "# Perform the reduction.\n", + "comp.inclusive_scan(it_input, it_output, reduce_op, h_init, len(d_input))\n", + "\n", + "print(d_input)\n", + "\n", + "h_input = d_input.get()\n", + "expected = h_input.cumsum() / np.arange(1, len(h_input) + 1)\n", + "\n", + "print(f\"Running average result: {d_output}\")\n", + "np.testing.assert_allclose(d_output.get(), expected)" + ] + }, + { + "cell_type": "markdown", + "id": "55a22e1a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Resources\n", + "\n", + "* `cuda-cccl` Documentation: https://nvidia.github.io/cccl/unstable/python/\n", + "* `parallel` API Reference: https://nvidia.github.io/cccl/unstable/python/compute_api.html#cuda-compute-api-reference" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/libraries/24__nvmath_python__interop.ipynb b/tutorials/accelerated-python/notebooks/libraries/24__nvmath_python__interop.ipynb index 4d1b174e..00e405c8 100644 --- a/tutorials/accelerated-python/notebooks/libraries/24__nvmath_python__interop.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/24__nvmath_python__interop.ipynb @@ -125,6 +125,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -140,11 +146,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" - }, - "colab": { - "provenance": [], - "include_colab_link": true + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/25__nvmath_python__kernel_fusion.ipynb b/tutorials/accelerated-python/notebooks/libraries/25__nvmath_python__kernel_fusion.ipynb index c02406c6..1bf6f78c 100644 --- a/tutorials/accelerated-python/notebooks/libraries/25__nvmath_python__kernel_fusion.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/25__nvmath_python__kernel_fusion.ipynb @@ -236,6 +236,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -251,10 +257,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" - }, - "colab": { - "provenance": [] + "version": "3.11.7" } }, "nbformat": 4, diff 
--git a/tutorials/accelerated-python/notebooks/libraries/26__nvmath_python__stateful_apis.ipynb b/tutorials/accelerated-python/notebooks/libraries/26__nvmath_python__stateful_apis.ipynb index da4f084c..e46fbf0c 100644 --- a/tutorials/accelerated-python/notebooks/libraries/26__nvmath_python__stateful_apis.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/26__nvmath_python__stateful_apis.ipynb @@ -75,6 +75,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -90,10 +96,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" - }, - "colab": { - "provenance": [] + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/27__nvmath_python__scaling.ipynb b/tutorials/accelerated-python/notebooks/libraries/27__nvmath_python__scaling.ipynb index e1b6d08e..861d681e 100644 --- a/tutorials/accelerated-python/notebooks/libraries/27__nvmath_python__scaling.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/27__nvmath_python__scaling.ipynb @@ -200,6 +200,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -215,10 +221,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" - }, - "colab": { - "provenance": [] + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/28__pynvml.ipynb b/tutorials/accelerated-python/notebooks/libraries/28__pynvml.ipynb index 460d8fd3..4d4b0289 100644 --- a/tutorials/accelerated-python/notebooks/libraries/28__pynvml.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/28__pynvml.ipynb @@ -180,7 +180,7 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/solutions/20__cudf__nyc_parking_violations__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/libraries/solutions/20__cudf__nyc_parking_violations__SOLUTION.ipynb index 3b5c4de6..94b670d3 100644 --- a/tutorials/accelerated-python/notebooks/libraries/solutions/20__cudf__nyc_parking_violations__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/solutions/20__cudf__nyc_parking_violations__SOLUTION.ipynb @@ -1,681 +1,687 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DataFrames with Pandas and cuDF" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Table of Contents\n", - "\n", - "1. [Introduction](#1.-Introduction)\n", - "2. [Pandas Essentials](#2.-Pandas-Essentials)\n", - " - [2.1 Series and DataFrame Objects](#2.1-Series-and-DataFrame-Objects)\n", - " - [2.2 Selecting and Filtering Data](#2.2-Selecting-and-Filtering-Data)\n", - " - [2.3 Sorting](#2.3-Sorting)\n", - " - [2.4 Summarizing Data](#2.4-Summarizing-Data)\n", - " - [2.5 Grouped Aggregations (groupby)](#2.5-Grouped-Aggregations-(groupby))\n", - " - [2.6 String Operations](#2.6-String-Operations)\n", - " - [2.7 Time Series](#2.7-Time-Series)\n", - " - [2.8 User-Defined Operations (apply)](#2.8-User-Defined-Operations-(apply))\n", - "3. [Enter cuDF: GPU DataFrames](#3.-Enter-cuDF:-GPU-DataFrames)\n", - " - [3.1 Exercise: Date Formatting Failure](#3.1-Exercise:-Date-Formatting-Failure)\n", - " - [3.2 Exercise: Why `.apply()` Breaks Down in cuDF](#3.2-Exercise:-Why-`.apply()`-Breaks-Down-in-cuDF)\n", - "4. 
[Exercise: Analyzing Real Data (NYC Parking Violations)](#4.-Exercise:-Analyzing-Real-Data-(NYC-Parking-Violations))\n", - " - [Step 0: Download Data](#Step-0:-Download-Data)\n", - " - [Task 1: Data Inspection (Pandas)](#📝-Task-1:-Data-Inspection-(Pandas))\n", - " - [Task 2: Analyze Taxis (Pandas)](#📝-Task-2:-Analyze-Taxis-(Pandas))\n", - " - [Task 3: GPU Acceleration (cuDF)](#📝-Task-3:-GPU-Acceleration-(cuDF))\n", - "5. [Conclusion](#Conclusion)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Introduction\n", - "\n", - "In this notebook, we will build a foundation in data manipulation using **Pandas**, a popular tool for Python data analysis. Then, we will transition to **cuDF**, which allows us to run standard Pandas-like code on the GPU.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Learning Objectives:\n", - "\n", - "- **Introduce core Pandas operations:** Indexing, Filtering, Aggregating, and Time Series.\n", - "- **Learn the subtle differences** (and speed benefits) when porting code to cuDF.\n", - "- **Exercise:** Apply these skills to analyze a real-world NYC Parking Violations dataset.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** Throughout this notebook, we provide \"Quick Docs\" sections to remind you of common syntax. However, these are not exhaustive. For complete API details, parameters, and edge cases, you should always reference the official [Pandas Documentation](https://pandas.pydata.org/docs/) or the [cuDF Documentation](https://docs.rapids.ai/api/cudf/stable/).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 2. Pandas Essentials\n", - "\n", - "Before we accelerate with GPUs, let's ensure we are comfortable with the DataFrame API. 
Even if you are a Pandas pro, this refresher sets the baseline syntax we will replicate later.\n", - "\n", - "First, import the library:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1 Series and DataFrame Objects\n", - "\n", - "- **Series:** A one-dimensional labeled array (like a powerful list or a single column).\n", - "- **DataFrame:** A two-dimensional labeled data structure (like a spreadsheet or SQL table).\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `pd.Series(data)`: Create a Series.\n", - "- `pd.DataFrame(data, index)`: Create a DataFrame.\n", - "- `df.head(n)` / `df.tail(n)`: View the first/last n rows.\n", - "- `df.index` / `df.columns`: Access row labels and column names.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# A Series acts like a single column of data\n", - "s = pd.Series([10, 20, 30])\n", - "print(f\"Max value in series: {s.max()}\")\n", - "\n", - "# A DataFrame is a collection of Series sharing an index\n", - "df = pd.DataFrame({\n", - " \"a\": [1, 2, 1, 3, 2],\n", - " \"b\": [1, 4, 7, 2, 0],\n", - " \"c\": [3, 3, 3, 4, 5]\n", - "}, index=[1, 2, 3, 4, 5])\n", - "\n", - "# View the structure\n", - "print(\"Columns:\", df.columns)\n", - "df.head(3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2 Selecting and Filtering Data\n", - "\n", - "Selecting specific subsets of data is the most common task in analysis. 
You can select by column name, label index, or integer position.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df['col']`: Select a single column (returns a Series).\n", - "- `df[['col1', 'col2']]`: Select multiple columns (returns a DataFrame).\n", - "- `df.loc[label]`: Select row(s) by index label.\n", - "- `df.iloc[position]`: Select row(s) by integer position (0-based).\n", - "- `df[condition]`: Boolean indexing (filtering).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Select specific columns\n", - "subset = df[[\"b\", \"c\"]]\n", - "\n", - "# Select rows by label (loc) and position (iloc)\n", - "row_label_2 = df.loc[2] # Row with index label 2\n", - "row_pos_0 = df.iloc[0] # First row (physically)\n", - "\n", - "# Boolean Indexing: Filter rows where column 'a' is greater than 1\n", - "filtered_df = df[df['a'] > 1]\n", - "filtered_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.3 Sorting\n", - "\n", - "Ordering data helps in ranking and visualization.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df.sort_values(by='col', ascending=True/False)`: Sort by one or more columns.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Sort by column 'a' in ascending order\n", - "sorted_df = df.sort_values(\"a\")\n", - "sorted_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.4 Summarizing Data\n", - "\n", - "It's straightforward to get a quick overview of your data's distribution.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df.describe()`: Summary statistics (count, mean, std, etc.).\n", - "- `df.mean()`, `df.sum()`, `df.max()`: Aggregations across columns.\n", - "- `df['col'].value_counts()`: Count unique values (useful for histograms).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ 
- "# Calculate the sum of every column\n", - "print(\"Sum of columns:\\n\", df.sum())\n", - "\n", - "# Count frequency of values in column 'a'\n", - "print(\"\\nValue counts for 'a':\\n\", df[\"a\"].value_counts())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.5 Grouped Aggregations (groupby)\n", - "\n", - "The \"Split-Apply-Combine\" strategy. Split data into groups based on some criteria, apply a function to each group, and combine the results.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df.groupby('col')`: Group data.\n", - "- `.mean()`, `.count()`: Apply aggregation.\n", - "- `.agg({'col': ['min', 'max']})`: Apply complex, specific aggregations.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Group by 'a' and calculate the mean of 'b' and 'c' for each group\n", - "grouped_mean = df.groupby(\"a\").mean()\n", - "print(grouped_mean)\n", - "\n", - "# Complex aggregation: Get min and mean of 'b', and max of 'c'\n", - "agg_df = df.groupby(\"a\").agg({\n", - " \"b\": [\"min\", \"mean\"],\n", - " \"c\": [\"max\"]\n", - "})\n", - "agg_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.6 String Operations\n", - "\n", - "Pandas provides vectorized string functions via the `.str` accessor.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df['col'].str.upper()`: Convert to uppercase.\n", - "- `df['col'].str.contains('pattern')`: Boolean check for substring.\n", - "- `df['col'].str.replace('old', 'new')`: Replace text." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Add a string column\n", - "df[\"names\"] = [\"mario\", \"luigi\", \"yoshi\", \"peach\", \"toad\"]\n", - "\n", - "# Convert to uppercase\n", - "df[\"names_upper\"] = df[\"names\"].str.upper()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.7 Time Series\n", - "\n", - "Pandas was originally developed for financial time series analysis. It handles dates and times robustly via the `.dt` accessor.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `pd.to_datetime()`: Convert strings to datetime objects.\n", - "- `df['date'].dt.year`: Extract year component.\n", - "- `df['date'].dt.dayofweek`: Extract day of week.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a date range\n", - "date_df = pd.DataFrame()\n", - "date_df[\"date\"] = pd.date_range(\"2018-11-20\", periods=5, freq=\"D\")\n", - "date_df[\"value\"] = np.random.sample(len(date_df))\n", - "\n", - "# Filter by date\n", - "subset_dates = date_df[date_df[\"date\"] < \"2018-11-23\"]\n", - "\n", - "# Extract features\n", - "date_df[\"year\"] = date_df[\"date\"].dt.year\n", - "date_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.8 User-Defined Operations (apply)\n", - "\n", - "When built-in functions aren't enough, you can apply custom Python functions.\n", - "\n", - "**Quick Docs:**\n", - "\n", - "- `df['col'].apply(func)`: Apply function `func` to every element.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def add_ten(x):\n", - " return x + 10\n", - "\n", - "# Apply the custom function\n", - "df[\"a_plus_10\"] = df[\"a\"].apply(add_ten)\n", - "df.head(2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 3. 
Enter cuDF: GPU DataFrames\n", - "\n", - "cuDF mimics the Pandas API but runs on the GPU. The transition is often as simple as changing the import, but there are some constraints you must know.\n", - "\n", - "First, let's create a GPU DataFrame.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import cudf\n", - "\n", - "# Create a cuDF DataFrame (data resides on GPU)\n", - "gdf = cudf.DataFrame({\n", - " \"a\": [1, 2, 1, 3, 2],\n", - " \"b\": [1, 4, 7, 2, 0],\n", - " \"c\": [1, 1, 8, 2, 9]\n", - "}, index=[1, 2, 3, 4, 5])\n", - "\n", - "# Operations work exactly the same!\n", - "print(type(gdf))\n", - "gdf.groupby(\"a\").mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.1 Exercise: Date Formatting Failure\n", - "\n", - "Pandas is very forgiving with date formats. cuDF is stricter. Run the cell below to see what happens when you use a non-standard date string.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# EXECUTE THIS CELL TO SEE THE ERROR\n", - "try:\n", - " date_df = cudf.DataFrame()\n", - " # Pandas handles \"11/20/2018\" easily. Does cuDF?\n", - " date_df[\"date\"] = cudf.date_range(\"11/20/2018\", periods=72, freq=\"D\")\n", - "except Exception as e:\n", - " print(f\"Error caught: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Why did this fail?** Unlike Pandas, cuDF currently requires ISO-standard date formats (Year-Month-Day) for creating date ranges.\n", - "\n", - "- **Pandas:** Guesses `11/20/2018` is Nov 20th.\n", - "- **cuDF:** Requires `2018-11-20`.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.2 Exercise: Why `.apply()` Breaks Down in cuDF\n", - "\n", - "In Pandas, `.apply()` works by executing Python-level code over the data, which makes it very flexible but CPU-centric. 
On the GPU, this model does not translate directly: GPUs cannot execute arbitrary Python bytecode. In cuDF, `.apply()`-style user-defined functions must be JIT-compiled with Numba for the CUDA target, which imposes constraints:\n", - "- The function must be Numba-compilable (no general Python objects or unsupported features).\n", - "- Only operations that Numba can lower to GPU device code are allowed.\n", - "- Code must follow a more restricted, compilation-friendly execution model.\n", - "\n", - "Note that cuDF UDFs are not limited to pure math. Some string operations and well-defined null-handling patterns are supported. See the [cuDF apply() documentation](https://docs.rapids.ai/api/cudf/latest/user_guide/api_docs/api/cudf.dataframe.apply/) for the full set of supported features and limitations.\n", - "\n", - "Even simple-looking Python functions often fall outside these constraints:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# A function that looks innocent but is NOT GPU-safe\n", - "def add_ten_verbose(x):\n", - " # Python branching + dynamic typing make this un-compilable for the GPU\n", - " if isinstance(x, (int, float)):\n", - " return x + 10\n", - " else:\n", - " return x" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is perfectly legal in Pandas. 
But in cuDF, Numba cannot:\n", - "\n", - "- interpret `isinstance`\n", - "- handle Python branching on object types\n", - "- JIT-compile dynamic return values\n", - "\n", - "Now try running it:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Execute this cell to observe the cuDF limitation\n", - "try:\n", - " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_verbose)\n", - "except Exception as e:\n", - " print(\"cuDF apply() constraint caught:\")\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**What happens?** cuDF attempts to compile the function → compilation fails → you get a runtime error. This mirrors real-world failure modes: anything that is not pure numerical logic will break.\n", - "\n", - "Here is the same logic, rewritten in a way the GPU can compile:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# GPU-safe version: no Python, no branching, pure math\n", - "def add_ten_gpu(x):\n", - " return x + 10\n", - "try:\n", - " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_gpu)\n", - "except Exception as e:\n", - " print(\"cuDF apply() constraint caught:\")\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is one of the few forms that Numba can translate. But even this version is not preferred. **cuDF `.apply()` is a last resort**. Even if your function compiles, `.apply()` still triggers:\n", - "\n", - "- JIT compilation overhead (slow startup)\n", - "- Kernel launch overhead\n", - "- Reduced optimization compared to built-in GPU operations\n", - "\n", - "For typical column transformations, this is simply unnecessary. 
\n", - "**Best practice is to always use vectorized operations:**\n", - "\n", - "```python\n", - "gdf[\"a\"] + 10\n", - "```\n", - "The vectorized version is:\n", - "- faster\n", - "- simpler\n", - "- more readable\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 4. Exercise: Analyzing Real Data (NYC Parking Violations)\n", - "\n", - "Now you will apply what you learned to a large, real-world dataset.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 0: Download Data\n", - "\n", - "We will fetch a subset of the NYC Parking Violations dataset (Fiscal Year 2022).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "![ -f nyc_parking_violations_2022.parquet ] || curl -fsSL -o nyc_parking_violations_2022.parquet https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 1: Data Inspection (Pandas)\n", - "\n", - "**Goal:** Load the data and inspect its structure.\n", - "\n", - "**Instructions:**\n", - "\n", - "1. Read the file `nyc_parking_violations_2022.parquet` into a Pandas DataFrame.\n", - "2. Print the columns.\n", - "3. Create a subset DataFrame with only: `Registration State`, `Violation Description`, `Vehicle Body Type`, `Issue Date`.\n", - "4. 
Display the head of this subset.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "# Read parquet file\n", - "df = pd.read_parquet(\"nyc_parking_violations_2022.parquet\")\n", - "\n", - "# Print columns\n", - "print(df.columns)\n", - "\n", - "# Select specific columns\n", - "df_subset = df[[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\"]]\n", - "\n", - "# Display head\n", - "df_subset.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 2: Analyze Taxis (Pandas)\n", - "\n", - "**Goal:** Filter, Group, and Count.\n", - "\n", - "**Instructions:**\n", - "\n", - "1. Filter the DataFrame to find rows where `Vehicle Body Type` is `\"TAXI\"`.\n", - "2. Group by `Registration State`.\n", - "3. Count the occurrences to see which states the taxis are registered in.\n", - "4. Sort the results descending to find the top states.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "\n", - "# Filter for TAXI\n", - "taxi_df = df_subset[df_subset[\"Vehicle Body Type\"] == \"TAXI\"]\n", - "\n", - "# Group by State and count\n", - "state_counts = taxi_df.groupby(\"Registration State\").size()\n", - "\n", - "# Sort and display top results\n", - "state_counts.sort_values(ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Task 3: GPU Acceleration (cuDF)\n", - "\n", - "**Goal:** Measure the speedup.\n", - "\n", - "**Instructions:**\n", - "\n", - "1. Import `cudf`.\n", - "2. Use `%%time` at the top of the cell.\n", - "3. 
Replicate the entire pipeline (Read -> Filter columns -> Filter Rows -> Group -> Sort) using `cudf`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "import cudf\n", - "\n", - "# Read parquet file into cuDF DataFrame\n", - "gdf = cudf.read_parquet(\"nyc_parking_violations_2022.parquet\")\n", - "\n", - "# Select specific columns (Filter columns)\n", - "gdf_subset = gdf[[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\"]]\n", - "\n", - "# Filter for TAXI (Filter Rows)\n", - "taxi_gdf = gdf_subset[gdf_subset[\"Vehicle Body Type\"] == \"TAXI\"]\n", - "\n", - "# Group by State and count\n", - "state_counts_gpu = taxi_gdf.groupby(\"Registration State\").size()\n", - "\n", - "# Sort and display top results\n", - "state_counts_gpu.sort_values(ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## Conclusion\n", - "\n", - "Compare the **Wall time** of Task 2 vs Task 3. You should see a significant performance improvement with cuDF, especially as data size grows!\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (RAPIDS 25.10)", - "language": "python", - "name": "cudf-cu13-25.10" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DataFrames with Pandas and cuDF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "1. [Introduction](#1.-Introduction)\n", + "2. 
[Pandas Essentials](#2.-Pandas-Essentials)\n", + " - [2.1 Series and DataFrame Objects](#2.1-Series-and-DataFrame-Objects)\n", + " - [2.2 Selecting and Filtering Data](#2.2-Selecting-and-Filtering-Data)\n", + " - [2.3 Sorting](#2.3-Sorting)\n", + " - [2.4 Summarizing Data](#2.4-Summarizing-Data)\n", + " - [2.5 Grouped Aggregations (groupby)](#2.5-Grouped-Aggregations-(groupby))\n", + " - [2.6 String Operations](#2.6-String-Operations)\n", + " - [2.7 Time Series](#2.7-Time-Series)\n", + " - [2.8 User-Defined Operations (apply)](#2.8-User-Defined-Operations-(apply))\n", + "3. [Enter cuDF: GPU DataFrames](#3.-Enter-cuDF:-GPU-DataFrames)\n", + " - [3.1 Exercise: Date Formatting Failure](#3.1-Exercise:-Date-Formatting-Failure)\n", + " - [3.2 Exercise: Why `.apply()` Breaks Down in cuDF](#3.2-Exercise:-Why-`.apply()`-Breaks-Down-in-cuDF)\n", + "4. [Exercise: Analyzing Real Data (NYC Parking Violations)](#4.-Exercise:-Analyzing-Real-Data-(NYC-Parking-Violations))\n", + " - [Step 0: Download Data](#Step-0:-Download-Data)\n", + " - [Task 1: Data Inspection (Pandas)](#Task-1:-Data-Inspection-(Pandas))\n", + " - [Task 2: Analyze Taxis (Pandas)](#Task-2:-Analyze-Taxis-(Pandas))\n", + " - [Task 3: GPU Acceleration (cuDF)](#Task-3:-GPU-Acceleration-(cuDF))\n", + "5. [Conclusion](#Conclusion)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Introduction\n", + "\n", + "In this notebook, we will build a foundation in data manipulation using **Pandas**, a popular tool for Python data analysis. 
Then, we will transition to **cuDF**, which allows us to run standard Pandas-like code on the GPU.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Learning Objectives:\n", + "\n", + "- **Introduce core Pandas operations:** Indexing, Filtering, Aggregating, and Time Series.\n", + "- **Learn the subtle differences** (and speed benefits) when porting code to cuDF.\n", + "- **Exercise:** Apply these skills to analyze a real-world NYC Parking Violations dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** Throughout this notebook, we provide \"Quick Docs\" sections to remind you of common syntax. However, these are not exhaustive. For complete API details, parameters, and edge cases, you should always reference the official [Pandas Documentation](https://pandas.pydata.org/docs/) or the [cuDF Documentation](https://docs.rapids.ai/api/cudf/stable/).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 2. Pandas Essentials\n", + "\n", + "Before we accelerate with GPUs, let's ensure we are comfortable with the DataFrame API. 
Even if you are a Pandas pro, this refresher sets the baseline syntax we will replicate later.\n", + "\n", + "First, import the library:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Series and DataFrame Objects\n", + "\n", + "- **Series:** A one-dimensional labeled array (like a powerful list or a single column).\n", + "- **DataFrame:** A two-dimensional labeled data structure (like a spreadsheet or SQL table).\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `pd.Series(data)`: Create a Series.\n", + "- `pd.DataFrame(data, index)`: Create a DataFrame.\n", + "- `df.head(n)` / `df.tail(n)`: View the first/last n rows.\n", + "- `df.index` / `df.columns`: Access row labels and column names.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A Series acts like a single column of data\n", + "s = pd.Series([10, 20, 30])\n", + "print(f\"Max value in series: {s.max()}\")\n", + "\n", + "# A DataFrame is a collection of Series sharing an index\n", + "df = pd.DataFrame({\n", + " \"a\": [1, 2, 1, 3, 2],\n", + " \"b\": [1, 4, 7, 2, 0],\n", + " \"c\": [3, 3, 3, 4, 5]\n", + "}, index=[1, 2, 3, 4, 5])\n", + "\n", + "# View the structure\n", + "print(\"Columns:\", df.columns)\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selecting and Filtering Data\n", + "\n", + "Selecting specific subsets of data is the most common task in analysis. 
You can select by column name, label index, or integer position.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col']`: Select a single column (returns a Series).\n", + "- `df[['col1', 'col2']]`: Select multiple columns (returns a DataFrame).\n", + "- `df.loc[label]`: Select row(s) by index label.\n", + "- `df.iloc[position]`: Select row(s) by integer position (0-based).\n", + "- `df[condition]`: Boolean indexing (filtering).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select specific columns\n", + "subset = df[[\"b\", \"c\"]]\n", + "\n", + "# Select rows by label (loc) and position (iloc)\n", + "row_label_2 = df.loc[2] # Row with index label 2\n", + "row_pos_0 = df.iloc[0] # First row (physically)\n", + "\n", + "# Boolean Indexing: Filter rows where column 'a' is greater than 1\n", + "filtered_df = df[df['a'] > 1]\n", + "filtered_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Sorting\n", + "\n", + "Ordering data helps in ranking and visualization.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.sort_values(by='col', ascending=True/False)`: Sort by one or more columns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by column 'a' in ascending order\n", + "sorted_df = df.sort_values(\"a\")\n", + "sorted_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Summarizing Data\n", + "\n", + "It's straightforward to get a quick overview of your data's distribution.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.describe()`: Summary statistics (count, mean, std, etc.).\n", + "- `df.mean()`, `df.sum()`, `df.max()`: Aggregations across columns.\n", + "- `df['col'].value_counts()`: Count unique values (useful for histograms).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ 
+ "# Calculate the sum of every column\n", + "print(\"Sum of columns:\\n\", df.sum())\n", + "\n", + "# Count frequency of values in column 'a'\n", + "print(\"\\nValue counts for 'a':\\n\", df[\"a\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.5 Grouped Aggregations (groupby)\n", + "\n", + "The \"Split-Apply-Combine\" strategy. Split data into groups based on some criteria, apply a function to each group, and combine the results.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.groupby('col')`: Group data.\n", + "- `.mean()`, `.count()`: Apply aggregation.\n", + "- `.agg({'col': ['min', 'max']})`: Apply complex, specific aggregations.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Group by 'a' and calculate the mean of 'b' and 'c' for each group\n", + "grouped_mean = df.groupby(\"a\").mean()\n", + "print(grouped_mean)\n", + "\n", + "# Complex aggregation: Get min and mean of 'b', and max of 'c'\n", + "agg_df = df.groupby(\"a\").agg({\n", + " \"b\": [\"min\", \"mean\"],\n", + " \"c\": [\"max\"]\n", + "})\n", + "agg_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.6 String Operations\n", + "\n", + "Pandas provides vectorized string functions via the `.str` accessor.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col'].str.upper()`: Convert to uppercase.\n", + "- `df['col'].str.contains('pattern')`: Boolean check for substring.\n", + "- `df['col'].str.replace('old', 'new')`: Replace text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Add a string column\n", + "df[\"names\"] = [\"mario\", \"luigi\", \"yoshi\", \"peach\", \"toad\"]\n", + "\n", + "# Convert to uppercase\n", + "df[\"names_upper\"] = df[\"names\"].str.upper()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.7 Time Series\n", + "\n", + "Pandas was originally developed for financial time series analysis. It handles dates and times robustly via the `.dt` accessor.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `pd.to_datetime()`: Convert strings to datetime objects.\n", + "- `df['date'].dt.year`: Extract year component.\n", + "- `df['date'].dt.dayofweek`: Extract day of week.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a date range\n", + "date_df = pd.DataFrame()\n", + "date_df[\"date\"] = pd.date_range(\"2018-11-20\", periods=5, freq=\"D\")\n", + "date_df[\"value\"] = np.random.sample(len(date_df))\n", + "\n", + "# Filter by date\n", + "subset_dates = date_df[date_df[\"date\"] < \"2018-11-23\"]\n", + "\n", + "# Extract features\n", + "date_df[\"year\"] = date_df[\"date\"].dt.year\n", + "date_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.8 User-Defined Operations (apply)\n", + "\n", + "When built-in functions aren't enough, you can apply custom Python functions.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col'].apply(func)`: Apply function `func` to every element.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_ten(x):\n", + " return x + 10\n", + "\n", + "# Apply the custom function\n", + "df[\"a_plus_10\"] = df[\"a\"].apply(add_ten)\n", + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 3. 
Enter cuDF: GPU DataFrames\n", + "\n", + "cuDF mimics the Pandas API but runs on the GPU. The transition is often as simple as changing the import, but there are some constraints you must know.\n", + "\n", + "First, let's create a GPU DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cudf\n", + "\n", + "# Create a cuDF DataFrame (data resides on GPU)\n", + "gdf = cudf.DataFrame({\n", + " \"a\": [1, 2, 1, 3, 2],\n", + " \"b\": [1, 4, 7, 2, 0],\n", + " \"c\": [1, 1, 8, 2, 9]\n", + "}, index=[1, 2, 3, 4, 5])\n", + "\n", + "# Operations work exactly the same!\n", + "print(type(gdf))\n", + "gdf.groupby(\"a\").mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 Exercise: Date Formatting Failure\n", + "\n", + "Pandas is very forgiving with date formats. cuDF is stricter. Run the cell below to see what happens when you use a non-standard date string.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# EXECUTE THIS CELL TO SEE THE ERROR\n", + "try:\n", + " date_df = cudf.DataFrame()\n", + " # Pandas handles \"11/20/2018\" easily. Does cuDF?\n", + " date_df[\"date\"] = cudf.date_range(\"11/20/2018\", periods=72, freq=\"D\")\n", + "except Exception as e:\n", + " print(f\"Error caught: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Why did this fail?** Unlike Pandas, cuDF currently requires ISO-standard date formats (Year-Month-Day) for creating date ranges.\n", + "\n", + "- **Pandas:** Guesses `11/20/2018` is Nov 20th.\n", + "- **cuDF:** Requires `2018-11-20`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Exercise: Why `.apply()` Breaks Down in cuDF\n", + "\n", + "In Pandas, `.apply()` works by executing Python-level code over the data, which makes it very flexible but CPU-centric. 
On the GPU, this model does not translate directly: GPUs cannot execute arbitrary Python bytecode. In cuDF, `.apply()`-style user-defined functions must be JIT-compiled with Numba for the CUDA target, which imposes constraints:\n", + "- The function must be Numba-compilable (no general Python objects or unsupported features).\n", + "- Only operations that Numba can lower to GPU device code are allowed.\n", + "- Code must follow a more restricted, compilation-friendly execution model.\n", + "\n", + "Note that cuDF UDFs are not limited to pure math. Some string operations and well-defined null-handling patterns are supported. See the [cuDF apply() documentation](https://docs.rapids.ai/api/cudf/latest/user_guide/api_docs/api/cudf.dataframe.apply/) for the full set of supported features and limitations.\n", + "\n", + "Even simple-looking Python functions often fall outside these constraints:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A function that looks innocent but is NOT GPU-safe\n", + "def add_ten_verbose(x):\n", + " # Python branching + dynamic typing make this un-compilable for the GPU\n", + " if isinstance(x, (int, float)):\n", + " return x + 10\n", + " else:\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is perfectly legal in Pandas. 
But in cuDF, Numba cannot:\n", + "\n", + "- interpret `isinstance`\n", + "- handle Python branching on object types\n", + "- JIT-compile dynamic return values\n", + "\n", + "Now try running it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute this cell to observe the cuDF limitation\n", + "try:\n", + " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_verbose)\n", + "except Exception as e:\n", + " print(\"cuDF apply() constraint caught:\")\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**What happens?** cuDF attempts to compile the function → compilation fails → you get a runtime error. This mirrors real-world failure modes: anything that is not pure numerical logic will break.\n", + "\n", + "Here is the same logic, rewritten in a way the GPU can compile:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GPU-safe version: no Python, no branching, pure math\n", + "def add_ten_gpu(x):\n", + " return x + 10\n", + "try:\n", + " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_gpu)\n", + "except Exception as e:\n", + " print(\"cuDF apply() constraint caught:\")\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is one of the few forms that Numba can translate. But even this version is not preferred. **cuDF `.apply()` is a last resort**. Even if your function compiles, `.apply()` still triggers:\n", + "\n", + "- JIT compilation overhead (slow startup)\n", + "- Kernel launch overhead\n", + "- Reduced optimization compared to built-in GPU operations\n", + "\n", + "For typical column transformations, this is simply unnecessary. 
\n", + "**Best practice is to always use vectorized operations:**\n", + "\n", + "```python\n", + "gdf[\"a\"] + 10\n", + "```\n", + "The vectorized version is:\n", + "- faster\n", + "- simpler\n", + "- more readable\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 4. Exercise: Analyzing Real Data (NYC Parking Violations)\n", + "\n", + "Now you will apply what you learned to a large, real-world dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 0: Download Data\n", + "\n", + "We will fetch a subset of the NYC Parking Violations dataset (Fiscal Year 2022).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "![ -f nyc_parking_violations_2022.parquet ] || curl -fsSL -o nyc_parking_violations_2022.parquet https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 1: Data Inspection (Pandas)\n", + "\n", + "**Goal:** Load the data and inspect its structure.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Read the file `nyc_parking_violations_2022.parquet` into a Pandas DataFrame.\n", + "2. Print the columns.\n", + "3. Create a subset DataFrame with only: `Registration State`, `Violation Description`, `Vehicle Body Type`, `Issue Date`.\n", + "4. 
Display the head of this subset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Read parquet file\n", + "df = pd.read_parquet(\"nyc_parking_violations_2022.parquet\")\n", + "\n", + "# Print columns\n", + "print(df.columns)\n", + "\n", + "# Select specific columns\n", + "df_subset = df[[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\"]]\n", + "\n", + "# Display head\n", + "df_subset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2: Analyze Taxis (Pandas)\n", + "\n", + "**Goal:** Filter, Group, and Count.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Filter the DataFrame to find rows where `Vehicle Body Type` is `\"TAXI\"`.\n", + "2. Group by `Registration State`.\n", + "3. Count the occurrences to see which states the taxis are registered in.\n", + "4. Sort the results descending to find the top states.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "# Filter for TAXI\n", + "taxi_df = df_subset[df_subset[\"Vehicle Body Type\"] == \"TAXI\"]\n", + "\n", + "# Group by State and count\n", + "state_counts = taxi_df.groupby(\"Registration State\").size()\n", + "\n", + "# Sort and display top results\n", + "state_counts.sort_values(ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3: GPU Acceleration (cuDF)\n", + "\n", + "**Goal:** Measure the speedup.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Import `cudf`.\n", + "2. Use `%%time` at the top of the cell.\n", + "3. 
Replicate the entire pipeline (Read -> Filter columns -> Filter Rows -> Group -> Sort) using `cudf`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import cudf\n", + "\n", + "# Read parquet file into cuDF DataFrame\n", + "gdf = cudf.read_parquet(\"nyc_parking_violations_2022.parquet\")\n", + "\n", + "# Select specific columns (Filter columns)\n", + "gdf_subset = gdf[[\"Registration State\", \"Violation Description\", \"Vehicle Body Type\", \"Issue Date\"]]\n", + "\n", + "# Filter for TAXI (Filter Rows)\n", + "taxi_gdf = gdf_subset[gdf_subset[\"Vehicle Body Type\"] == \"TAXI\"]\n", + "\n", + "# Group by State and count\n", + "state_counts_gpu = taxi_gdf.groupby(\"Registration State\").size()\n", + "\n", + "# Sort and display top results\n", + "state_counts_gpu.sort_values(ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Conclusion\n", + "\n", + "Compare the **Wall time** of Task 2 vs Task 3. 
You should see a significant performance improvement with cuDF, especially as data size grows!\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (RAPIDS 25.10)", + "language": "python", + "name": "cudf-cu13-25.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/libraries/solutions/21__cudf_pandas__nyc_parking_violations__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/libraries/solutions/21__cudf_pandas__nyc_parking_violations__SOLUTION.ipynb index 0927a480..d5563685 100644 --- a/tutorials/accelerated-python/notebooks/libraries/solutions/21__cudf_pandas__nyc_parking_violations__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/solutions/21__cudf_pandas__nyc_parking_violations__SOLUTION.ipynb @@ -60,6 +60,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (RAPIDS 25.10)", "language": "python", @@ -75,11 +81,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "colab": { - "provenance": [], - "gpuType": "T4" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/libraries/solutions/23__cuda_cccl__customizing_algorithms__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/libraries/solutions/23__cuda_cccl__customizing_algorithms__SOLUTION.ipynb index 8024bf4e..000fc522 100644 --- 
a/tutorials/accelerated-python/notebooks/libraries/solutions/23__cuda_cccl__customizing_algorithms__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/solutions/23__cuda_cccl__customizing_algorithms__SOLUTION.ipynb @@ -159,9 +159,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -178,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/start.ipynb b/tutorials/accelerated-python/notebooks/start.ipynb index 2597a0b2..ac1ecd7c 100644 --- a/tutorials/accelerated-python/notebooks/start.ipynb +++ b/tutorials/accelerated-python/notebooks/start.ipynb @@ -16,9 +16,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -26,7 +28,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cuda_core_cccl__2_hours.ipynb b/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cuda_core_cccl__2_hours.ipynb index 4748a021..4bf89062 100644 --- a/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cuda_core_cccl__2_hours.ipynb +++ b/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cuda_core_cccl__2_hours.ipynb @@ -36,9 +36,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { 
"display_name": "Python 3 (ipykernel)", @@ -46,7 +48,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cupy_cudf_cccl_kernels__8_hours.ipynb b/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cupy_cudf_cccl_kernels__8_hours.ipynb index 9d4a89f9..ece2a266 100644 --- a/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cupy_cudf_cccl_kernels__8_hours.ipynb +++ b/tutorials/accelerated-python/notebooks/syllabi/cuda_python__cupy_cudf_cccl_kernels__8_hours.ipynb @@ -51,9 +51,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -61,7 +63,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/accelerated-python/notebooks/syllabi/pyhpc__numpy_cupy_mpi4py__4_hours.ipynb b/tutorials/accelerated-python/notebooks/syllabi/pyhpc__numpy_cupy_mpi4py__4_hours.ipynb index bcf08bf4..31b36332 100644 --- a/tutorials/accelerated-python/notebooks/syllabi/pyhpc__numpy_cupy_mpi4py__4_hours.ipynb +++ b/tutorials/accelerated-python/notebooks/syllabi/pyhpc__numpy_cupy_mpi4py__4_hours.ipynb @@ -37,9 +37,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -47,7 +49,16 @@ 
"name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/cuda-cpp/notebooks/01.01-Introduction/01.01.01-CUDA-Made-Easy.ipynb b/tutorials/cuda-cpp/notebooks/01.01-Introduction/01.01.01-CUDA-Made-Easy.ipynb index 63b6c32c..708d2a47 100644 --- a/tutorials/cuda-cpp/notebooks/01.01-Introduction/01.01.01-CUDA-Made-Easy.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.01-Introduction/01.01.01-CUDA-Made-Easy.ipynb @@ -112,9 +112,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.01-Execution-Spaces.ipynb b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.01-Execution-Spaces.ipynb index fda89389..eccd3ae2 100644 --- a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.01-Execution-Spaces.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.01-Execution-Spaces.ipynb @@ -332,7 +332,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -346,9 +346,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.02-Exercise-Annotate-Execution-Spaces.ipynb b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.02-Exercise-Annotate-Execution-Spaces.ipynb index 7c1003ff..6767aade 100644 --- 
a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.02-Exercise-Annotate-Execution-Spaces.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.02-Exercise-Annotate-Execution-Spaces.ipynb @@ -128,7 +128,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -142,9 +142,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.03-Exercise-Changing-Execution-Space.ipynb b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.03-Exercise-Changing-Execution-Space.ipynb index c8fecd19..6a98abf6 100644 --- a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.03-Exercise-Changing-Execution-Space.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.03-Exercise-Changing-Execution-Space.ipynb @@ -117,7 +117,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -131,9 +131,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.04-Exercise-Compute-Median-Temperature.ipynb b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.04-Exercise-Compute-Median-Temperature.ipynb index 2c34724d..0bef2dbc 100644 --- a/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.04-Exercise-Compute-Median-Temperature.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.02-Execution-Spaces/01.02.04-Exercise-Compute-Median-Temperature.ipynb @@ -144,7 +144,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": 
"Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -158,9 +158,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.01-Extending-Algorithms.ipynb b/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.01-Extending-Algorithms.ipynb index ff930869..2b50542c 100644 --- a/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.01-Extending-Algorithms.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.01-Extending-Algorithms.ipynb @@ -818,9 +818,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.02-Exercise-Computing-Variance.ipynb b/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.02-Exercise-Computing-Variance.ipynb index 5a4c5e7a..2a3a0d52 100644 --- a/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.02-Exercise-Computing-Variance.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.03-Extending-Algorithms/01.03.02-Exercise-Computing-Variance.ipynb @@ -173,7 +173,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -187,9 +187,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.01-Vocabulary-Types.ipynb b/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.01-Vocabulary-Types.ipynb index 361a0f39..06f0b4a6 
100644 --- a/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.01-Vocabulary-Types.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.01-Vocabulary-Types.ipynb @@ -324,7 +324,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -338,9 +338,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.02-Exercise-mdspan.ipynb b/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.02-Exercise-mdspan.ipynb index 79511278..7ad0c1e4 100644 --- a/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.02-Exercise-mdspan.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.04-Vocabulary-Types/01.04.02-Exercise-mdspan.ipynb @@ -217,7 +217,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -231,9 +231,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.01-Serial-vs-Parallel.ipynb b/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.01-Serial-vs-Parallel.ipynb index 9447cff3..f1b1ad0a 100644 --- a/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.01-Serial-vs-Parallel.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.01-Serial-vs-Parallel.ipynb @@ -278,7 +278,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -292,9 +292,9 @@ "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.02-Exercise-Segmented-Sum-Optimization.ipynb b/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.02-Exercise-Segmented-Sum-Optimization.ipynb index bbc7208e..e990a897 100644 --- a/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.02-Exercise-Segmented-Sum-Optimization.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.02-Exercise-Segmented-Sum-Optimization.ipynb @@ -188,7 +188,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -202,9 +202,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.03-Exercise-Segmented-Mean.ipynb b/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.03-Exercise-Segmented-Mean.ipynb index 1406d5da..60343c5b 100644 --- a/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.03-Exercise-Segmented-Mean.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.05-Serial-vs-Parallel/01.05.03-Exercise-Segmented-Mean.ipynb @@ -245,7 +245,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -259,9 +259,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.01-Memory-Spaces.ipynb 
b/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.01-Memory-Spaces.ipynb index 170eaf9f..aabd8521 100644 --- a/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.01-Memory-Spaces.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.01-Memory-Spaces.ipynb @@ -183,7 +183,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -197,9 +197,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.02-Exercise-Copy.ipynb b/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.02-Exercise-Copy.ipynb index 33a119d1..d0079fc6 100644 --- a/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.02-Exercise-Copy.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.06-Memory-Spaces/01.06.02-Exercise-Copy.ipynb @@ -186,7 +186,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -200,9 +200,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/01.07-Summary/01.07.01-Summary.ipynb b/tutorials/cuda-cpp/notebooks/01.07-Summary/01.07.01-Summary.ipynb index e6429901..a92bc727 100644 --- a/tutorials/cuda-cpp/notebooks/01.07-Summary/01.07.01-Summary.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.07-Summary/01.07.01-Summary.ipynb @@ -58,9 +58,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git 
a/tutorials/cuda-cpp/notebooks/01.08-Advanced/01.08.01-Advanced.ipynb b/tutorials/cuda-cpp/notebooks/01.08-Advanced/01.08.01-Advanced.ipynb index ccb28c3c..25a173b2 100644 --- a/tutorials/cuda-cpp/notebooks/01.08-Advanced/01.08.01-Advanced.ipynb +++ b/tutorials/cuda-cpp/notebooks/01.08-Advanced/01.08.01-Advanced.ipynb @@ -193,9 +193,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.01-Introduction/02.01.01-Introduction.ipynb b/tutorials/cuda-cpp/notebooks/02.01-Introduction/02.01.01-Introduction.ipynb index 81827177..dab1d4e6 100644 --- a/tutorials/cuda-cpp/notebooks/02.01-Introduction/02.01.01-Introduction.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.01-Introduction/02.01.01-Introduction.ipynb @@ -78,9 +78,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.01-Asynchrony.ipynb b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.01-Asynchrony.ipynb index f2dc7016..8012d7ab 100644 --- a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.01-Asynchrony.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.01-Asynchrony.ipynb @@ -326,7 +326,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -340,9 +340,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.02-Exercise-Compute-IO-Overlap.ipynb 
b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.02-Exercise-Compute-IO-Overlap.ipynb index 3660909d..0f138db2 100644 --- a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.02-Exercise-Compute-IO-Overlap.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.02-Exercise-Compute-IO-Overlap.ipynb @@ -252,7 +252,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -266,9 +266,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.03-Exercise-Nsight.ipynb b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.03-Exercise-Nsight.ipynb index ab91a783..7438c5e7 100644 --- a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.03-Exercise-Nsight.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.03-Exercise-Nsight.ipynb @@ -118,7 +118,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -132,9 +132,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.04-Exercise-NVTX.ipynb b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.04-Exercise-NVTX.ipynb index 2342d1ce..2e18fe49 100644 --- a/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.04-Exercise-NVTX.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.02-Asynchrony/02.02.04-Exercise-NVTX.ipynb @@ -189,7 +189,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -203,9 +203,9 
@@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.01-Streams.ipynb b/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.01-Streams.ipynb index 6c879cb5..e6182fce 100644 --- a/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.01-Streams.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.01-Streams.ipynb @@ -271,9 +271,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.02-Exercise-Async-Copy.ipynb b/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.02-Exercise-Async-Copy.ipynb index 1f216d64..380014e4 100644 --- a/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.02-Exercise-Async-Copy.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.03-Streams/02.03.02-Exercise-Async-Copy.ipynb @@ -215,7 +215,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -229,9 +229,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.01-Pinned.ipynb b/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.01-Pinned.ipynb index 4b0aaf76..1858109e 100644 --- a/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.01-Pinned.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.01-Pinned.ipynb @@ -140,9 +140,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - 
"nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.02-Exercise-Copy-Overlap.ipynb b/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.02-Exercise-Copy-Overlap.ipynb index cd37127a..afdd88a7 100644 --- a/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.02-Exercise-Copy-Overlap.ipynb +++ b/tutorials/cuda-cpp/notebooks/02.04-Pinned-Memory/02.04.02-Exercise-Copy-Overlap.ipynb @@ -223,7 +223,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -237,9 +237,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.01-Introduction/03.01-Introduction.ipynb b/tutorials/cuda-cpp/notebooks/03.01-Introduction/03.01-Introduction.ipynb index 2e105eec..86a9195c 100644 --- a/tutorials/cuda-cpp/notebooks/03.01-Introduction/03.01-Introduction.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.01-Introduction/03.01-Introduction.ipynb @@ -74,9 +74,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.01-Kernels.ipynb b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.01-Kernels.ipynb index 655497fa..040e2084 100644 --- a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.01-Kernels.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.01-Kernels.ipynb @@ -457,7 +457,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -471,9 +471,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.02-Exercise-Symmetry.ipynb b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.02-Exercise-Symmetry.ipynb index ae7bb957..262f3de0 100644 --- a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.02-Exercise-Symmetry.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.02-Exercise-Symmetry.ipynb @@ -217,9 +217,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.03-Exercise-Row-Symmetry.ipynb b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.03-Exercise-Row-Symmetry.ipynb index ec25580b..abc05da0 100644 --- a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.03-Exercise-Row-Symmetry.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.03-Exercise-Row-Symmetry.ipynb @@ -218,9 +218,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.04-Dev-Tools.ipynb b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.04-Dev-Tools.ipynb index cf8e3159..0d66d16a 100644 --- a/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.04-Dev-Tools.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.02-Kernels/03.02.04-Dev-Tools.ipynb @@ -135,9 +135,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.01-Histogram.ipynb b/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.01-Histogram.ipynb index ce63473d..e16d3807 100644 
--- a/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.01-Histogram.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.01-Histogram.ipynb @@ -218,7 +218,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -232,9 +232,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.02-Exercise-Fix-Histogram.ipynb b/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.02-Exercise-Fix-Histogram.ipynb index 2f5bc406..d938dcb4 100644 --- a/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.02-Exercise-Fix-Histogram.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.03-Atomics/03.03.02-Exercise-Fix-Histogram.ipynb @@ -205,9 +205,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.01-Sync.ipynb b/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.01-Sync.ipynb index 32f556d5..b89f4b54 100644 --- a/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.01-Sync.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.01-Sync.ipynb @@ -232,7 +232,7 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -246,9 +246,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.02-Exercise-Histogram.ipynb 
b/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.02-Exercise-Histogram.ipynb index 7119af7b..a3418039 100644 --- a/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.02-Exercise-Histogram.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.04-Synchronization/03.04.02-Exercise-Histogram.ipynb @@ -227,9 +227,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.01-Shared.ipynb b/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.01-Shared.ipynb index 9f840251..26f52f19 100644 --- a/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.01-Shared.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.01-Shared.ipynb @@ -149,9 +149,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.02-Exercise-Optimize-Histogram.ipynb b/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.02-Exercise-Optimize-Histogram.ipynb index f09ab299..bbeb46da 100644 --- a/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.02-Exercise-Optimize-Histogram.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.05-Shared-Memory/03.05.02-Exercise-Optimize-Histogram.ipynb @@ -225,9 +225,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.01-Cooperative.ipynb b/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.01-Cooperative.ipynb index df4cdfc0..9785dcdb 100644 --- 
a/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.01-Cooperative.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.01-Cooperative.ipynb @@ -210,9 +210,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.02-Exercise-Cooperative-Histogram.ipynb b/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.02-Exercise-Cooperative-Histogram.ipynb index 8b139f2a..2c77db23 100644 --- a/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.02-Exercise-Cooperative-Histogram.ipynb +++ b/tutorials/cuda-cpp/notebooks/03.06-Cooperative-Algorithms/03.06.02-Exercise-Cooperative-Histogram.ipynb @@ -223,9 +223,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/cuda-tile/notebooks/01__cutile_python_intro__vector_add.ipynb b/tutorials/cuda-tile/notebooks/01__cutile_python_intro__vector_add.ipynb index 8f40eefe..d6491665 100644 --- a/tutorials/cuda-tile/notebooks/01__cutile_python_intro__vector_add.ipynb +++ b/tutorials/cuda-tile/notebooks/01__cutile_python_intro__vector_add.ipynb @@ -289,9 +289,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -299,7 +301,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git 
a/tutorials/cuda-tile/notebooks/start.ipynb b/tutorials/cuda-tile/notebooks/start.ipynb index 8bf0f177..a1a05b11 100644 --- a/tutorials/cuda-tile/notebooks/start.ipynb +++ b/tutorials/cuda-tile/notebooks/start.ipynb @@ -20,9 +20,11 @@ } ], "metadata": { + "accelerator": "GPU", "colab": { "gpuType": "T4", - "provenance": [] + "provenance": [], + "toc_visible": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", @@ -30,7 +32,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/01_kernel_fusion.ipynb b/tutorials/nvmath-python/notebooks/01_kernel_fusion.ipynb index 263e9e60..812b2c47 100644 --- a/tutorials/nvmath-python/notebooks/01_kernel_fusion.ipynb +++ b/tutorials/nvmath-python/notebooks/01_kernel_fusion.ipynb @@ -618,6 +618,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -633,7 +639,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/01_kernel_fusion_SOLUTION.ipynb b/tutorials/nvmath-python/notebooks/01_kernel_fusion_SOLUTION.ipynb index 469e85fb..cd3c4794 100644 --- a/tutorials/nvmath-python/notebooks/01_kernel_fusion_SOLUTION.ipynb +++ b/tutorials/nvmath-python/notebooks/01_kernel_fusion_SOLUTION.ipynb @@ -126,8 +126,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", 
"name": "python3" }, @@ -141,7 +147,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/02_mem_exec_spaces.ipynb b/tutorials/nvmath-python/notebooks/02_mem_exec_spaces.ipynb index 2c379cd6..eb34e82e 100644 --- a/tutorials/nvmath-python/notebooks/02_mem_exec_spaces.ipynb +++ b/tutorials/nvmath-python/notebooks/02_mem_exec_spaces.ipynb @@ -404,6 +404,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -419,7 +425,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/02_mem_exec_spaces_SOLUTION.ipynb b/tutorials/nvmath-python/notebooks/02_mem_exec_spaces_SOLUTION.ipynb index 9f491da3..3bddbaed 100644 --- a/tutorials/nvmath-python/notebooks/02_mem_exec_spaces_SOLUTION.ipynb +++ b/tutorials/nvmath-python/notebooks/02_mem_exec_spaces_SOLUTION.ipynb @@ -96,8 +96,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -111,7 +117,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/03_stateful_api.ipynb b/tutorials/nvmath-python/notebooks/03_stateful_api.ipynb index bb8673f4..ff103dfd 100644 --- a/tutorials/nvmath-python/notebooks/03_stateful_api.ipynb +++ b/tutorials/nvmath-python/notebooks/03_stateful_api.ipynb @@ -499,6 +499,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + 
"gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -514,7 +520,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/03_stateful_api_SOLUTION.ipynb b/tutorials/nvmath-python/notebooks/03_stateful_api_SOLUTION.ipynb index c3a3acdb..a8d7f323 100644 --- a/tutorials/nvmath-python/notebooks/03_stateful_api_SOLUTION.ipynb +++ b/tutorials/nvmath-python/notebooks/03_stateful_api_SOLUTION.ipynb @@ -115,8 +115,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -130,7 +136,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/04_callbacks.ipynb b/tutorials/nvmath-python/notebooks/04_callbacks.ipynb index 05a1ee40..7dc58f99 100644 --- a/tutorials/nvmath-python/notebooks/04_callbacks.ipynb +++ b/tutorials/nvmath-python/notebooks/04_callbacks.ipynb @@ -991,6 +991,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -1006,7 +1012,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/04_callbacks_SOLUTION.ipynb b/tutorials/nvmath-python/notebooks/04_callbacks_SOLUTION.ipynb index 67d7b664..dd6257ac 100644 --- a/tutorials/nvmath-python/notebooks/04_callbacks_SOLUTION.ipynb +++ 
b/tutorials/nvmath-python/notebooks/04_callbacks_SOLUTION.ipynb @@ -314,8 +314,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -329,7 +335,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/05_device_api.ipynb b/tutorials/nvmath-python/notebooks/05_device_api.ipynb index 5729d78c..d4bc141a 100644 --- a/tutorials/nvmath-python/notebooks/05_device_api.ipynb +++ b/tutorials/nvmath-python/notebooks/05_device_api.ipynb @@ -922,6 +922,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -937,7 +943,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/05_device_api_SOLUTION.ipynb b/tutorials/nvmath-python/notebooks/05_device_api_SOLUTION.ipynb index 0bad96e4..6ff151af 100644 --- a/tutorials/nvmath-python/notebooks/05_device_api_SOLUTION.ipynb +++ b/tutorials/nvmath-python/notebooks/05_device_api_SOLUTION.ipynb @@ -350,8 +350,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -365,7 +371,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/06_sparse_solver.ipynb 
b/tutorials/nvmath-python/notebooks/06_sparse_solver.ipynb index 0d135578..20c8d0bf 100644 --- a/tutorials/nvmath-python/notebooks/06_sparse_solver.ipynb +++ b/tutorials/nvmath-python/notebooks/06_sparse_solver.ipynb @@ -446,8 +446,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -461,7 +467,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/nvmath-python/notebooks/06_sparse_solver_SOLUTION.ipynb b/tutorials/nvmath-python/notebooks/06_sparse_solver_SOLUTION.ipynb index 8ef19d55..48ecaf7e 100644 --- a/tutorials/nvmath-python/notebooks/06_sparse_solver_SOLUTION.ipynb +++ b/tutorials/nvmath-python/notebooks/06_sparse_solver_SOLUTION.ipynb @@ -138,8 +138,14 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -153,7 +159,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/tutorials/stdpar/notebooks/cpp/lab1_daxpy/daxpy.ipynb b/tutorials/stdpar/notebooks/cpp/lab1_daxpy/daxpy.ipynb index 8cba008e..353a0ce1 100644 --- a/tutorials/stdpar/notebooks/cpp/lab1_daxpy/daxpy.ipynb +++ b/tutorials/stdpar/notebooks/cpp/lab1_daxpy/daxpy.ipynb @@ -907,6 +907,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -922,9 +928,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/cpp/lab1_select/select.ipynb b/tutorials/stdpar/notebooks/cpp/lab1_select/select.ipynb index 72e88427..b466a967 100644 --- a/tutorials/stdpar/notebooks/cpp/lab1_select/select.ipynb +++ b/tutorials/stdpar/notebooks/cpp/lab1_select/select.ipynb @@ -228,6 +228,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -243,9 +249,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/cpp/lab2_heat/heat.ipynb b/tutorials/stdpar/notebooks/cpp/lab2_heat/heat.ipynb index d4962d1d..b2659831 100644 --- a/tutorials/stdpar/notebooks/cpp/lab2_heat/heat.ipynb +++ b/tutorials/stdpar/notebooks/cpp/lab2_heat/heat.ipynb @@ -471,6 +471,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -486,9 +492,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/cpp/lab3_tree/tree.ipynb b/tutorials/stdpar/notebooks/cpp/lab3_tree/tree.ipynb index 64363544..f40521f9 100644 --- a/tutorials/stdpar/notebooks/cpp/lab3_tree/tree.ipynb +++ b/tutorials/stdpar/notebooks/cpp/lab3_tree/tree.ipynb @@ -176,6 +176,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 
(ipykernel)", "language": "python", @@ -191,9 +197,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/cpp/start.ipynb b/tutorials/stdpar/notebooks/cpp/start.ipynb index b0cc4ef1..c79c9e81 100644 --- a/tutorials/stdpar/notebooks/cpp/start.ipynb +++ b/tutorials/stdpar/notebooks/cpp/start.ipynb @@ -101,6 +101,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -116,9 +122,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/fortran/lab1_matmul/matmul.ipynb b/tutorials/stdpar/notebooks/fortran/lab1_matmul/matmul.ipynb index 07018f7c..13792f7a 100755 --- a/tutorials/stdpar/notebooks/fortran/lab1_matmul/matmul.ipynb +++ b/tutorials/stdpar/notebooks/fortran/lab1_matmul/matmul.ipynb @@ -114,6 +114,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -129,9 +135,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/fortran/lab2_daxpy/daxpy.ipynb b/tutorials/stdpar/notebooks/fortran/lab2_daxpy/daxpy.ipynb index 84a953e5..940294e7 100644 --- a/tutorials/stdpar/notebooks/fortran/lab2_daxpy/daxpy.ipynb +++ b/tutorials/stdpar/notebooks/fortran/lab2_daxpy/daxpy.ipynb @@ -285,6 +285,12 @@ } ], "metadata": { + "accelerator": "GPU", + 
"colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -300,9 +306,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/fortran/lab3_heat/heat.ipynb b/tutorials/stdpar/notebooks/fortran/lab3_heat/heat.ipynb index 9583e8de..c7cf820a 100644 --- a/tutorials/stdpar/notebooks/fortran/lab3_heat/heat.ipynb +++ b/tutorials/stdpar/notebooks/fortran/lab3_heat/heat.ipynb @@ -402,6 +402,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -417,9 +423,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/fortran/start.ipynb b/tutorials/stdpar/notebooks/fortran/start.ipynb index c2e1c971..db0e5736 100644 --- a/tutorials/stdpar/notebooks/fortran/start.ipynb +++ b/tutorials/stdpar/notebooks/fortran/start.ipynb @@ -116,6 +116,12 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -131,9 +137,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/tutorials/stdpar/notebooks/start.ipynb b/tutorials/stdpar/notebooks/start.ipynb index e8b90954..b4e64a65 100644 --- a/tutorials/stdpar/notebooks/start.ipynb +++ b/tutorials/stdpar/notebooks/start.ipynb @@ -14,6 +14,12 @@ } ], 
"metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -29,9 +35,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 }